文山羊 2020-03-14
这一部分内容是我请教同学后完成的
其思路是首先从简介中分析获取关键词,这部分在上篇博客中,这里不再详细说明,然后分析数据中已有的关键词和行业的关系,然后分析未知关键词的行业分类
这个方法会让每个关键词都匹配出很多的行业分类,因此我将每条数据匹配出的所有行业分类存入一个列表中,然后取出现次数最多的行业分类
行业代码是根据2011版的代码进行的搜索匹配
完整代码如下
import re
import kejichengguo.sql as SQL
def getindustry(trains, datas):
    """Pick, for each record in ``trains``, the industry most often seen with its keywords.

    Every space-separated keyword of a ``trains`` record is searched for, as a
    plain substring, in the keyword string of each ``datas`` row.  The
    industries of all matching rows are collected in match order, and the
    most frequent industry among the first 20 collected candidates is
    printed and written back through ``SQL.updateindustry``.

    Args:
        trains: iterable of mappings with ``'keyword'`` (space-separated
            string, or None) and ``'id'`` entries -- the records to classify.
        datas: iterable of mappings with ``'keyword'`` and ``'industry'``
            entries (space-separated strings, or None) -- the reference rows
            whose industry is already known.

    Returns:
        None.  Results are reported with ``print`` and persisted with
        ``SQL.updateindustry(industry, str(id))``.
    """
    # Pair every usable reference keyword string with its industries up
    # front.  Keeping them together (instead of two parallel lists walked
    # with a hand-maintained index) means a row with a missing keyword can
    # no longer shift the index and attribute a match to the wrong industry.
    references = [
        (row['keyword'], row['industry'].split())
        for row in datas
        if row['keyword'] is not None and row['industry'] is not None
    ]

    for train in trains:
        keywords = train['keyword']
        if keywords is None:
            continue
        record_id = str(train['id'])

        industry_list = []
        for keyword in keywords.split():
            for ref_keywords, ref_industries in references:
                # Literal substring test: keywords are data, not regular
                # expressions, so characters such as '+' or '(' must not be
                # interpreted as regex syntax.
                if keyword in ref_keywords:
                    industry_list.extend(ref_industries)

        # Only the first 20 candidates take part in the vote (cap kept from
        # the original implementation); ties go to the earliest candidate.
        window = industry_list[0:20]
        if not window:
            # No reference row shares a keyword with this record.
            print(record_id, industry_list)
            continue

        industry = max(window, key=window.count)
        print(record_id, industry)
        try:
            SQL.updateindustry(industry, record_id)
        except Exception as exc:
            # Best effort, as before: report the failed record and carry on
            # with the remaining ones instead of aborting the whole run.
            print(record_id, industry_list, exc)
def getindustrycode(datas):
    """Translate each record's industry names into industry codes and store them.

    For every space-separated industry name of a record, the first two
    characters are looked up with ``SQL.select_industrycode``; when the
    lookup returns rows, the ``'code'`` of the first row is appended.  The
    resulting string (each code preceded by a single space) is printed and
    saved with ``SQL.updateindustrycode``.

    Args:
        datas: iterable of mappings with ``'id'`` and ``'industry'``
            (space-separated string, or None) entries.

    Returns:
        None.  Output goes to ``print`` and ``SQL.updateindustrycode``.
    """
    for data in datas:
        industry = data['industry']
        if industry is None:
            # No industry was assigned to this record, so there is nothing
            # to translate; leave its stored code untouched.
            continue

        num_list = ""
        for name in industry.split():
            # The code table is searched by the two leading characters.
            prefix = name[0:2]
            if prefix == "新型":
                # NOTE(review): presumably names starting with "新型" have no
                # entry of their own and are filed under "材料" -- confirm
                # against the industry-code table.
                prefix = "材料"
            value_code = SQL.select_industrycode(prefix)
            if len(value_code) > 0:
                # Leading-space separator kept so stored values keep the
                # same format as before.
                num_list = num_list + " " + value_code[0]['code']

        print(data['id'], industry, num_list)
        SQL.updateindustrycode(num_list, data['id'])
if __name__ == '__main__':
    # Step 1 (currently disabled): derive an industry for each record from
    # its keywords.  Uncomment to run it.
    # trains = SQL.select_keyword()
    # datas = SQL.select_pi_keyword()
    # getindustry(trains, datas)

    # Step 2: map the stored industry names to industry codes.
    datas = SQL.select_industry()
    getindustrycode(datas)