Python常用功能函数系列总结(二)

kikaylee 2020-07-05

常用函数二:文本分词

方式一:jieba分词+停用词+自定义词典

# -*- coding: utf-8 -*-

"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: 文本分词
输入 停用词文件路径 词典文件路径 分词文件路径 表名(可选) 列名 分词结果列名 保存文件名
输出 分词结果-文件
"""
import os
import re

import jieba
import pandas as pd

# Make sure the output directory used by the example below exists.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('res', exist_ok=True)


class TextCut(object):
    """Segment text with jieba, with optional user dictionary and stopword filtering.

    Tokens are kept only if they are longer than one character, are not
    blank, and are not stopwords. ``cut2`` additionally keeps only tokens
    that appear verbatim as a line of the user dictionary file.
    """

    # Matches every run of characters that is neither an ASCII letter/digit
    # nor in the range U+4E00..U+9FA5. Compiled once instead of on every call.
    _NON_WORD_RE = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")

    def __init__(self, dictionary=None, stopwords=None, ):
        """
        :param dictionary: path of a jieba user dictionary file (optional)
        :param stopwords: path of a UTF-8 stopword file, one word per line (optional)
        """
        self.dictionary = dictionary
        # Set of dictionary lines used by cut2(); loaded on demand.
        self.word_list = None
        if self.dictionary:
            jieba.load_userdict(self.dictionary)
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as swf:
                # A set gives O(1) membership tests for every token.
                self.stopwords = {line.strip() for line in swf}
        else:
            self.stopwords = None

    @staticmethod
    def clean_txt(raw):
        """Replace every run of characters outside 0-9, a-z, A-Z, U+4E00..U+9FA5 with one space."""
        return TextCut._NON_WORD_RE.sub(' ', raw)

    def _load_word_list(self):
        """Read the user dictionary file into ``self.word_list`` (one stripped line per entry).

        :raises ValueError: if no dictionary path was given to the constructor
        """
        if not self.dictionary:
            raise ValueError('a dictionary file is required to restrict tokens to dictionary words')
        with open(self.dictionary, encoding='utf-8') as f:
            self.word_list = {word.strip() for word in f}

    def _join_tokens(self, tokens, vocabulary=None):
        """Filter *tokens* and join the survivors with single spaces.

        :param tokens: iterable of token strings
        :param vocabulary: if not None, only tokens contained in it are kept
        :return: space-separated string of the kept tokens
        """
        # No stopword file means nothing is filtered as a stopword.
        stopwords = self.stopwords or ()
        kept = [
            token for token in tokens
            if len(token) > 1
            and token.strip()
            and token not in stopwords
            and (vocabulary is None or token in vocabulary)
        ]
        return ' '.join(kept)

    def _segment(self, text, vocabulary=None):
        """Clean *text*, segment it with jieba and return the filtered tokens as one string."""
        # Empty Excel cells arrive as NaN (a float); treat any non-string as empty text.
        if not isinstance(text, str):
            return ''
        sentence = self.clean_txt(text.strip().replace('\n', ''))
        return self._join_tokens(jieba.cut(sentence), vocabulary)

    def cut(self, text):
        """Segment *text* and return the kept tokens joined by spaces.

        :param text: raw text; non-string values yield an empty string
        :return: space-separated tokens
        """
        return self._segment(text)

    def cut2(self, text):
        """Like :meth:`cut`, but keep only tokens that are lines of the user dictionary.

        :param text: raw text; non-string values yield an empty string
        :return: space-separated tokens
        :raises ValueError: if the word list is not loaded and no dictionary path is set
        """
        if self.word_list is None:
            self._load_word_list()
        return self._segment(text, vocabulary=self.word_list)

    def run(self, file_path, col_name, new_col_name, to_file, sheet_name=None, word_in_dict=False):
        """Segment one column of an Excel sheet and write the sheet back out.

        :param file_path: Excel file to read
        :param col_name: column holding the raw text
        :param new_col_name: column that receives the segmented text
        :param to_file: Excel file to write (written without the index)
        :param sheet_name: sheet to read; pandas' default sheet (0) when falsy
        :param word_in_dict: if True, keep only tokens found in the user dictionary
        :raises ValueError: if word_in_dict is True but no dictionary path was given
        """
        # `sheet_name or 0` keeps the original "falsy means default sheet" rule;
        # passing None through would make pandas return a dict of all sheets.
        df = pd.read_excel(file_path, sheet_name=sheet_name or 0)
        if word_in_dict:
            # Reload on every run so edits to the dictionary file are picked up.
            self._load_word_list()
            df[new_col_name] = df[col_name].apply(self.cut2)
        else:
            df[new_col_name] = df[col_name].apply(self.cut)
        df.to_excel(to_file, index=False)
        print('######### 处理完成 ############')


if __name__ == "__main__":
    # 1. Segment the text column of one sheet and save the result under res/.
    text_cut = TextCut(stopwords='data/stopwords.txt', dictionary='data/word_dict.txt')
    text_cut.run(file_path='data/山西政策.xlsx', sheet_name='1.21-2.20', col_name='全文', new_col_name='全文分词',
                 to_file='res/山西政策_分词.xlsx')

方式二:jieba分词+信息熵合并

# -*- coding: utf-8 -*-

"""
Datetime: 2020/03/01
Author: Zhang Yafei
Description: 基于信息熵对分词结果进行合并
"""
from collections import Counter
from functools import reduce
from pandas import read_excel, DataFrame


class InfoEntropyMerge(object):
    """Propose merged terms from adjacent token pairs that almost always occur together.

    Typical use: ``count_word_freq_one()``, ``count_word_freq_two()``, then
    ``clac_entropy()``. The score of a pair is
    ``freq(w1, w2) / max(freq(w1), freq(w2))``; pairs scoring above 0.5 are
    collected in ``entropy_words_dict`` keyed by the concatenated words.
    """

    def __init__(self, data, stopwords='data/stopwords.txt'):
        """
        :param data: iterable of token sequences (one sequence per document)
        :param stopwords: path of a UTF-8 stopword file, one word per line; falsy to disable
        """
        self.data = data
        self.words_freq_one = {}      # token -> count
        self.words_freq_two = {}      # (token, next token) -> count
        self.entropy_words_dict = {}  # merged term -> score
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as f:
                self.stopwords = {line.strip() for line in f}
        else:
            self.stopwords = None

    def count_word_freq_one(self, save_to_file=False, word_freq_file=None):
        """Count how often each non-empty token occurs.

        :param save_to_file: whether to also write the counts to an Excel file
        :param word_freq_file: Excel path to write; nothing is written when it is empty
        :return:
        """
        self.words_freq_one = Counter(word for word_list in self.data for word in word_list if word)
        # Same guard as count_word_freq_two: without a path there is nothing to write to.
        if save_to_file and word_freq_file:
            # most_common() yields (word, count) pairs, highest count first.
            words_df = DataFrame(self.words_freq_one.most_common(), columns=['word', 'freq'])
            words_df.to_excel(word_freq_file, index=False)

    def count_freq(self, word1, word2):
        """
        Record one occurrence of the adjacent pair (word1, word2).
        :param word1: left token
        :param word2: right token
        :return: word2, so the method can be folded over a token sequence
        """
        if (word1, word2) not in self.words_freq_two:
            self.words_freq_two[(word1, word2)] = 1
        else:
            self.words_freq_two[(word1, word2)] += 1
        return word2

    def count_word_freq_two(self, save_to_file=False, word_freq_file=None):
        """
        Count how often each pair of adjacent tokens occurs.
        :param save_to_file: whether to also write the counts to an Excel file
        :param word_freq_file: Excel path to write; nothing is written when it is empty
        :return:
        """
        # Start from scratch so repeated calls do not inflate the counts
        # (count_word_freq_one also replaces its result on every call).
        self.words_freq_two = {}
        for word_list in self.data:
            words = list(word_list)
            # zip() simply yields nothing for empty or single-token documents.
            for word1, word2 in zip(words, words[1:]):
                self.count_freq(word1, word2)
        if save_to_file and word_freq_file:
            ranked = sorted(self.words_freq_two.items(), key=lambda item: item[1], reverse=True)
            words_df = DataFrame(data={'word': [pair for pair, _ in ranked],
                                       'freq': [freq for _, freq in ranked]})
            words_df.to_excel(word_freq_file, index=False)

    @staticmethod
    def is_chinese(word):
        """Return True if any character of *word* lies in U+4E00..U+9FFF."""
        return any('\u4e00' <= ch <= '\u9fff' for ch in word)

    def clac_entropy(self, save_to_file=False, dict_path='data/entropy_dict.txt'):
        """
        Score every adjacent pair: E(w1, w2) = freq(w1, w2) / max(freq(w1), freq(w2)).

        Pairs scoring above 0.5 are stored in ``entropy_words_dict``. When a
        non-empty stopword set is loaded, both words must also be
        non-stopwords and contain at least one Chinese character.

        :param save_to_file: whether to prepend the new terms to the dictionary file
        :param dict_path: dictionary file path; created if it does not exist yet
        :return:
        """
        for (word1, word2), freq_two in self.words_freq_two.items():
            freq_one_max = max(self.words_freq_one[word1], self.words_freq_one[word2])
            if not freq_one_max:
                # Neither word was counted (e.g. empty tokens); no score can be computed.
                continue
            w1_w2_entropy = freq_two / freq_one_max
            if w1_w2_entropy <= 0.5:
                continue
            if self.stopwords:
                if (word1 in self.stopwords or word2 in self.stopwords
                        or not self.is_chinese(word1) or not self.is_chinese(word2)):
                    continue
            self.entropy_words_dict[word1+word2] = w1_w2_entropy

        print('信息熵大于0.5的词语组合:\n', self.entropy_words_dict)
        if save_to_file and dict_path:
            # Read whatever is already there so the new terms can be placed in front of it.
            try:
                with open(dict_path, encoding='utf-8') as f:
                    content = f.read()
            except FileNotFoundError:
                content = ''
            with open(dict_path, mode='w', encoding='utf-8') as f:
                f.writelines(word + '\n' for word in self.entropy_words_dict)
                f.write(content)
            print(f'成功将信息熵大于0.5的词语保存到了{dict_path}中')


def data_read(path, col_name):
    """Load one column of an Excel file as lists of whitespace-separated tokens.

    :param path: Excel file to read (pandas' default read settings)
    :param col_name: column whose cells hold whitespace-separated tokens
    :return: Series of token lists; rows whose cell is missing are dropped
    """
    column = read_excel(path)[col_name]
    present = column[column.notna()]
    return present.str.split()


if __name__ == '__main__':
    # Load the segmented text, count unigrams and adjacent pairs, then report
    # the pairs that score above 0.5 (without saving them to the dictionary).
    text_list = data_read(path='res/国家政策_分词.xlsx', col_name='全文分词')
    info_entro = InfoEntropyMerge(data=text_list)
    info_entro.count_word_freq_one()
    info_entro.count_word_freq_two()
    info_entro.clac_entropy(save_to_file=False, dict_path='data/entropy_dict.txt')

经验分享:若有好的词典和停用词,优先选用方式一,否则选择方式二。

常用函数三:词频统计

# -*- coding: utf-8 -*-

"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: 统计词频
输入 文件名 列名 分割符
输出 词频统计结果-文件
"""
from collections import Counter
import pandas as pd


def _word_freq_frame(df, col_name, sep):
    """Split the non-missing cells of df[col_name] on sep and return a word/freq table, most frequent first."""
    # astype(str) lets cells that are not strings (e.g. numbers) be split too.
    cells = df.loc[df[col_name].notna(), col_name].astype(str).str.split(sep)
    words_freq = Counter(word for word_list in cells for word in word_list if word)
    return pd.DataFrame(words_freq.most_common(), columns=['word', 'freq'])


def count_word_freq(file_path, col_name, to_file, sep='; ', multi_table=False):
    """
    Count word frequencies in one column of an Excel file and save them to another Excel file.

    NOTE: ``sep`` is honoured in both modes. Earlier versions split on
    whitespace whenever ``multi_table`` was False, ignoring ``sep``.

    :param file_path: path of the Excel file to read
    :param col_name: column holding the words; in multi-sheet mode an integer
                     selects a column by position from sheets read without a header row
    :param to_file: path of the Excel file to write
    :param sep: separator between words inside a cell
    :param multi_table: if True, process every sheet and write one result sheet per input sheet
    :return:
    """
    if multi_table:
        # A column *name* can only be looked up when the first row is read as
        # the header; positional (integer) access keeps the headerless read.
        header = None if isinstance(col_name, int) else 0
        datas = pd.read_excel(file_path, header=header, sheet_name=None)
        # Leaving the with-block saves the workbook; ExcelWriter.save() no
        # longer exists in pandas 2.x, so it must not be called explicitly.
        with pd.ExcelWriter(path=to_file) as writer:
            for sheet_name, df in datas.items():
                words_df = _word_freq_frame(df, col_name, sep)
                words_df.to_excel(excel_writer=writer, sheet_name=sheet_name, index=False)
    else:
        df = pd.read_excel(file_path)
        _word_freq_frame(df, col_name, sep).to_excel(to_file, index=False)


if __name__ == '__main__':
    # Count word frequencies of the `keyword` column in every sheet of data.xlsx,
    # splitting cells on the default separator '; ', and save the result to res.xlsx.
    count_word_freq(file_path='data.xlsx', col_name='keyword', to_file='res.xlsx', multi_table=True)

经验分享:注意输入格式为excel文件,这也是我学习生活中常用的处理方式,直接拿去用,非常方便

另外,在我之前的一篇博客中,我介绍了Python统计词频常用的几种方式,不同的场景可以满足你各自的需求。博客传送门:https://www.cnblogs.com/zhangyafei/p/10653977.html

相关推荐