python从简介中获取行业分类

文山羊 2020-03-14

这部分内容是我请教同学后完成的

其思路是:首先从简介中分析获取关键词(这部分在上篇博客中已经介绍,这里不再详细说明);然后分析数据中已有的关键词和行业的关系;最后据此推断未知关键词所属的行业分类

这个方法会让每个关键词都匹配出很多的行业分类,因此我将每条数据匹配出的所有行业分类存入一个列表中,然后取其中出现次数最多的行业分类

行业代码是根据2011版的代码进行的搜索匹配

完整代码如下

import re
import kejichengguo.sql as SQL


def getindustry(trains, datas):
    """Assign an industry to each unlabelled row by keyword voting.

    For every row in ``trains`` the keyword string is split on whitespace.
    Each keyword is searched for (as a plain substring) in the keyword string
    of every labelled row in ``datas``; the industries of all matching rows
    are collected, and the most frequent one among the first 20 collected
    entries is written back through ``SQL.updateindustry``.

    Args:
        trains: iterable of mappings with ``'id'`` and ``'keyword'`` entries
            (the rows to classify). Rows whose keyword is ``None`` are
            skipped.
        datas: iterable of mappings with ``'keyword'`` and ``'industry'``
            entries (the labelled reference rows). Rows where either value
            is ``None`` are ignored.

    Returns:
        None. Results are printed and persisted via ``SQL.updateindustry``.
    """
    # Keep each reference keyword paired with its own industry. Filtering the
    # pairs here (instead of skipping rows while advancing a separate index)
    # guarantees a match always reads the industry of the row that matched.
    references = [
        (data['keyword'], data['industry'])
        for data in datas
        if data['keyword'] is not None and data['industry'] is not None
    ]

    for train in trains:
        train_keyword = train['keyword']
        if train_keyword is None:
            continue

        # Resolve the ID up front so the no-match path reports this row,
        # not a value left over from an earlier iteration.
        mid_id = str(train['id'])

        industry_list = []
        for word in train_keyword.split():
            for ref_keyword, ref_industry in references:
                # Plain substring test: keywords are literal text, so regex
                # metacharacters such as "+" or "(" must not be interpreted.
                if word in ref_keyword:
                    industry_list.extend(ref_industry.split())

        # Only the first 20 collected industries take part in the vote.
        candidates = industry_list[0:20]
        if not candidates:
            print(mid_id, industry_list)
            continue

        # On a tie, max() keeps the candidate that was collected first.
        industry = max(candidates, key=candidates.count)
        print(mid_id, industry)
        try:
            SQL.updateindustry(industry, mid_id)
        except Exception as exc:
            # Best-effort batch job: report the failed row and carry on with
            # the remaining ones instead of aborting the whole run.
            print(mid_id, industry_list, exc)


def getindustrycode(datas):
    """Look up and store the industry code(s) for each row.

    Each row's ``'industry'`` string is split on whitespace. For every
    industry name, its first two characters are used as the search term for
    ``SQL.select_industrycode``; the ``'code'`` of the first returned record
    is appended to the row's code string. The resulting string is printed
    and written back with ``SQL.updateindustrycode``.

    Args:
        datas: iterable of mappings with ``'id'`` and ``'industry'`` entries.
            A ``None`` industry is treated as containing no names, so the
            row is stored with an empty code string.

    Returns:
        None.
    """
    for data in datas:
        names = (data['industry'] or "").split()

        num_list = ""
        for name in names:
            # Only the first two characters of the name are searched for.
            prefix = name[0:2]
            # Names starting with "新型" are looked up under "材料" instead
            # (presumably "新型材料"-style names; confirm against the data).
            if prefix == "新型":
                prefix = "材料"
            value_code = SQL.select_industrycode(prefix)
            if value_code:
                # Every code is prefixed with a space, so a non-empty result
                # starts with a space; kept to preserve the stored format.
                num_list = num_list + " " + value_code[0]['code']

        print(data['id'], data['industry'], num_list)
        SQL.updateindustrycode(num_list, data['id'])

if __name__ == '__main__':
    # Stage 1 (disabled here): infer an industry for each row from keywords.
    # trains= SQL.select_keyword()
    # datas= SQL.select_pi_keyword()
    # getindustry(trains, datas)
    # Stage 2: translate the stored industry names into industry codes.
    datas = SQL.select_industry()
    getindustrycode(datas)

完整代码