文山羊 2020-03-14
这部分内容是我请教同学之后完成的。
其思路是:首先从简介中分析获取关键词(这部分在上篇博客中已经介绍,这里不再详细说明);然后分析数据中已有的关键词和行业的关系;最后据此分析未知关键词的行业分类。
这个方法会让每个关键词都匹配出很多的行业分类,因此我将每条数据匹配出的所有行业分类存入一个列表中,然后取其中出现次数最多的行业分类。
行业代码是根据2011版的代码进行的搜索匹配
完整代码如下
import re

import kejichengguo.sql as SQL

# Only this many matched industry labels (in match order) take part in the
# majority vote; the value is carried over unchanged from the original script.
_VOTE_WINDOW = 20


def getindustry(trains, datas):
    """Pick the most frequently matched industry for every row in *trains*.

    For each whitespace-separated keyword of a row in *trains*, every row in
    *datas* whose keyword text contains that keyword contributes its
    whitespace-separated industry labels as votes.  The label that occurs
    most often among the first ``_VOTE_WINDOW`` votes is passed to
    ``SQL.updateindustry`` together with the row id (as a string).

    Args:
        trains: iterable of mappings with ``'keyword'`` and ``'id'`` keys;
            rows whose keyword is ``None`` are skipped.
        datas: iterable of mappings with ``'keyword'`` and ``'industry'``
            keys; rows where either value is ``None`` are ignored.

    Returns:
        None.  Results are printed and handed to ``SQL.updateindustry``.
    """
    # Keep keyword and industry paired so that skipping an incomplete row can
    # never shift one column against the other.
    references = [
        (row['keyword'], row['industry'])
        for row in datas
        if row['keyword'] is not None and row['industry'] is not None
    ]

    for train in trains:
        keywords = train['keyword']
        if keywords is None:
            continue
        # Resolved up front so every message below refers to the current row.
        record_id = str(train['id'])

        votes = []
        for term in keywords.split():
            for ref_keywords, ref_industry in references:
                # Literal substring match: keywords are plain text, not
                # regular expressions, so metacharacters must not be
                # interpreted.
                if term in ref_keywords:
                    votes.extend(ref_industry.split())

        window = votes[:_VOTE_WINDOW]
        if not window:
            # Nothing matched: report the row and leave it untouched.
            print(record_id, votes)
            continue

        # On ties max() keeps the label that was matched first.
        industry = max(window, key=window.count)
        print(record_id, industry)
        try:
            SQL.updateindustry(industry, record_id)
        except Exception as exc:
            # Best effort: one failed update must not abort the whole run,
            # but the failure is reported instead of silently swallowed.
            print(record_id, votes, exc)


def getindustrycode(datas):
    """Look up an industry code for every industry label of each row.

    Each whitespace-separated label in ``data['industry']`` is cut down to
    its first two characters and looked up with ``SQL.select_industrycode``.
    The ``'code'`` of the first hit of every successful lookup is collected,
    and the resulting string is passed to ``SQL.updateindustrycode`` together
    with the row id.

    Args:
        datas: iterable of mappings with ``'id'`` and ``'industry'`` keys;
            rows whose industry is ``None`` are skipped.

    Returns:
        None.  Results are printed and handed to ``SQL.updateindustrycode``.
    """
    for data in datas:
        industries = data['industry']
        if industries is None:
            continue

        codes = []
        for name in industries.split():
            prefix = name[:2]
            # NOTE(review): "新型" is looked up as "材料" - presumably so that
            # names starting with "新型材料" resolve to the materials
            # category; confirm against the code table.
            if prefix == "新型":
                prefix = "材料"
            matches = SQL.select_industrycode(prefix)
            if matches:
                codes.append(matches[0]['code'])

        # Every code is prefixed with a space (so the string starts with one),
        # which keeps the stored format identical to what the original
        # concatenation produced.
        num_list = "".join(" " + code for code in codes)
        print(data['id'], industries, num_list)
        SQL.updateindustrycode(num_list, data['id'])


if __name__ == '__main__':
    # Step 1 - derive industries from keywords; enable when needed:
    # trains = SQL.select_keyword()
    # datas = SQL.select_pi_keyword()
    # getindustry(trains, datas)

    # Step 2 - map the stored industries to industry codes.
    datas = SQL.select_industry()
    getindustrycode(datas)
完整代码