Python-在线文本情感分析实验_python

概述：本文给出一个基于情感词典的中文在线文本情感分析脚本。脚本使用 jieba 分词、re 过滤特殊字符、pandas 读取 Excel 数据、emoji 处理表情、difflib 判断文本相似度、configparser 读取路径配置，并用 wordcloud 与 matplotlib 生成词云。完整代码如下（每次处理两百条）：
"""Lexicon-based sentiment analysis of Chinese travel notes, plus a word cloud.

Original author's note: process two hundred records per run.

All file locations are read from the ``[filePaths]`` section of ``config.ini``
in the current working directory.
"""
import configparser  # path configuration
import difflib  # string similarity
import os
import re  # special-character filtering
from collections import Counter

import emoji  # emoji handling
import imageio  # image loading (background mask, currently unused)
import jieba  # word segmentation
import jieba.analyse  # keyword weighting
import jieba.posseg as pseg  # part-of-speech tagging
import matplotlib.pyplot as plt  # plotting
import numpy as np  # numeric helpers
import pandas as pd  # Excel loading
from wordcloud import WordCloud  # word cloud

# Everything except CJK ideographs, ASCII digits and ASCII letters.
_NON_WORD_RE = re.compile(
    u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])")
# Number of characters of context kept on each side of a keyword match.
_CONTEXT_CHARS = 20
# Two windows whose quick_ratio exceeds this are considered duplicates.
_SIMILARITY_THRESHOLD = 0.85


# Path configuration
def cfg():
    """Return the ``[filePaths]`` section of ``config.ini`` as a dict.

    ConfigParser lower-cases option names by default, so every key of the
    returned dict is lower-case regardless of how it is spelled in the file.
    """
    conf = configparser.ConfigParser()
    conf.read("config.ini", encoding='utf-8')
    return dict(conf.items('filePaths'))


# Keyword dictionary used to cut the data down to relevant passages
def wts_dict():
    """Return the lines of the keyword file with line terminators removed."""
    with open(cfg()['wts_dict_path'], encoding='utf-8') as f:
        return [line.replace("\n", "").replace("\r", "") for line in f]


# File loading
def dict_load(path):
    """Return the non-blank, stripped lines of the file at *path*.

    The file is decoded as UTF-8 with an optional BOM.
    """
    print("文件加载！")
    with open(path, encoding='utf-8-sig') as f:
        return [line.strip() for line in f if line.strip() != '']


# Sentiment scoring
def sents(sent, negdict, posdict, nodict):
    """Score one segmented sentence against the sentiment lexicons.

    Args:
        sent: sequence of tokens.
        negdict: container of negative sentiment words.
        posdict: container of positive sentiment words.
        nodict: container of negation words.

    Returns:
        ``(pos, neg)``: the accumulated positive and negative scores.

    A sentiment word's polarity is flipped when exactly one of the two tokens
    before it is a negation word (a double negation cancels out).  A word at
    index 2 or later with no negation in the two preceding tokens weighs 1.5;
    every other hit weighs 1.  A token found in both lexicons is treated as
    negative, because the negative lexicon is checked first.
    """
    pos = 0  # positive score
    neg = 0  # negative score
    for i, word in enumerate(sent):
        if word in negdict:
            is_negative = True
        elif word in posdict:
            is_negative = False
        else:
            continue

        # The original code had no branch for i == 0, so a sentiment word in
        # first position was silently dropped; it is now counted with weight 1.
        prev_negated = i >= 1 and sent[i - 1] in nodict
        prev2_negated = i >= 2 and sent[i - 2] in nodict
        flipped = prev_negated != prev2_negated
        weight = 1.5 if (i > 1 and not prev_negated and not prev2_negated) else 1

        if is_negative != flipped:
            neg = neg + weight
        else:
            pos = pos + weight
    return pos, neg


# Emoji handling
def filter_emoji(test_str):
    """Round-trip *test_str* through ``emoji.demojize`` / ``emoji.emojize``.

    NOTE(review): this does not strip emoji by itself; the non-word regex in
    ``batchProcessing`` is what removes them later.
    """
    result = emoji.demojize(test_str)
    return emoji.emojize(result)


# Load the data and turn it into a list of strings
def pretreatment():
    """Load the Excel sheet and return the cleaned ``Notes`` column as strings.

    Rows with a missing ``Title`` or ``Notes`` value and rows whose ``Notes``
    duplicates an earlier row are dropped.
    """
    excel = pd.read_excel(cfg()['excel_path'])
    # The time column is left alone for now; only these two columns are used.
    data = excel[['Title', 'Notes']]
    cleaned = data.dropna(how='any').drop_duplicates(subset='Notes')
    return [filter_emoji(str(note)) for note in cleaned['Notes']]


# Load the sentiment lexicons
def first_Load():
    """Return ``(pos_dict, neg_dict, no_dict)`` loaded from the configured paths."""
    paths = cfg()
    pos_dict = dict_load(paths['pos_dict_path'])  # positive words
    neg_dict = dict_load(paths['neg_dict_path'])  # negative words
    no_dict = dict_load(paths['no_dict_path'])  # negation words
    return pos_dict, neg_dict, no_dict


def comment_base_split(wts_lst, comment_base):
    """Extract context windows around every keyword occurrence in a comment.

    Args:
        wts_lst: iterable of keyword strings (matched literally).
        comment_base: the comment text.

    Returns:
        List of substrings of *comment_base*, one per keyword match, each
        extended by about 20 characters on both sides, with near-duplicates
        (quick_ratio > 0.85 against an already kept window) removed.
    """
    windows = []
    for keyword in wts_lst:
        # An empty keyword would match at every position of the comment.
        if not keyword or keyword not in comment_base:
            continue
        # Keywords are plain text, so escape them before using them as regex.
        for match in re.finditer(re.escape(keyword), comment_base):
            start, end = match.span()
            # Clamp at 0: the original produced a negative start for matches
            # beginning at 10..19, which sliced from the end of the string.
            start = 0 if start < _CONTEXT_CHARS else start - _CONTEXT_CHARS
            end = 20 if end < 10 else end + _CONTEXT_CHARS
            windows.append(comment_base[start:end])

    # Keep the first of each group of near-identical windows.  The original
    # removed items from the list while iterating it and compared every item
    # with itself, which could delete unique windows.
    unique = []
    for window in windows:
        if all(get_equal_rate_1(window, kept) <= _SIMILARITY_THRESHOLD
               for kept in unique):
            unique.append(window)
    return unique


# Similarity of two strings
def get_equal_rate_1(str1, str2):
    """Return ``difflib.SequenceMatcher.quick_ratio()`` for the two strings."""
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()


# Main processing
def batchProcessing():
    """Score every comment and append the results to the output files.

    For each comment, the keyword context windows are cleaned, segmented with
    jieba and scored; each cleaned window is appended to the ``juzi_path``
    file and the per-comment totals are appended via ``outputfile``.
    """
    # Keyword dictionary
    wts_lst = wts_dict()
    # Sentiment lexicons; sets make the membership tests in sents() O(1).
    pos_dict, neg_dict, no_dict = (set(words) for words in first_Load())
    # Comments as a list of strings
    inputs = pretreatment()
    juzi_path = cfg()['juzi_path']

    for raw_comment in inputs:
        # One comment, with line breaks and spaces removed
        comment_base = (raw_comment.replace("\n", "")
                        .replace("\r", "").replace(" ", ""))
        pos_lst = []
        neg_lst = []
        # Only the windows of the current comment are scored.  The original
        # re-scored the windows of all earlier comments on every iteration,
        # so each result included the scores of every previous comment.
        with open(juzi_path, 'a', encoding='utf-8') as f:
            for window in comment_base_split(wts_lst, comment_base):
                window = "".join(window.split())
                sub_str = _NON_WORD_RE.sub("", window)
                # Chinese word segmentation
                sent = jieba.lcut(sub_str)
                # sents() expects the negative lexicon first.
                pos, neg = sents(sent, neg_dict, pos_dict, no_dict)
                pos_lst.append(pos)
                neg_lst.append(neg)
                f.write(sub_str + '\n')
        # Save the result for this comment
        outputfile([comment_base], pos_lst, neg_lst)


# Save the analysis result
def outputfile(commentStr, pos, neg):
    """Append one result record to the file configured under ``out``.

    Args:
        commentStr: iterable of strings written verbatim before the scores.
        pos: iterable of positive scores.
        neg: iterable of negative scores.
    """
    pos_total = sum(pos)
    neg_total = sum(neg)
    # Net tendency is positive minus negative; the original added the two,
    # which could never yield a negative result.
    num = pos_total - neg_total
    with open(cfg()['out'], 'a', encoding='utf-8') as f:
        for text in commentStr:
            f.write(text)
        f.write("\n积极倾向值：{}".format(pos_total) + "\n")
        f.write("消极倾向值：{}".format(neg_total) + "\n")
        if num > 0:
            f.write("情感倾向：积极" + "\n")
        elif num < 0:
            f.write("情感倾向：消极" + "\n")
        else:
            f.write("情感倾向：中性" + "\n")
        f.write('-' * 100 + "\n")


# Word cloud
def toWordCloud():
    """Build, save and display a word cloud from the file configured as ``path``.

    Tokens of length 1 and tokens listed in the stop-word file are ignored.
    The 101 most frequent tokens are printed before the image is rendered.
    """
    paths = cfg()
    # Stop words; '\ufeff' is the BOM that may lead the file.
    with open(paths['cn_stopwords'], 'r', encoding='utf-8') as fr:
        stop_words = {line.replace('\ufeff', '').strip() for line in fr}

    # The file is written with 'utf-8-sig' by the script entry point.
    with open(paths['path'], 'r', encoding='utf-8-sig') as f:
        words = f.read()

    kept_words = [
        word
        for line in words.split('\n')
        for word in jieba.lcut(line)
        if len(word) > 1 and word not in stop_words
    ]
    word_dict = Counter(kept_words)
    word_list = ' '.join(kept_words)

    # Sort by frequency, most frequent first
    sort_words = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    print(sort_words[0:101])

    wc = WordCloud(
        background_color="white",  # background colour
        max_words=1000,  # maximum number of words shown
        font_path=paths['simsun_path'],  # font file
        min_font_size=20,
        max_font_size=500,
        random_state=42,  # random seed
        collocations=False,  # avoid repeated words
        width=1600,
        height=1200,
        margin=10,
    )
    wc.generate(word_list)
    # Keys returned by cfg() are lower-cased by ConfigParser, so the original
    # mixed-case lookup 'wordcloud_ioPath' could never succeed.
    wc.to_file(paths['wordcloud_iopath'])

    plt.figure(dpi=100)
    # Show the word cloud as an image
    plt.imshow(wc, interpolation='catrom', vmax=1000)
    # Hide the axes
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    batchProcessing()
    with open(cfg()['juzi_path'], encoding="utf-8-sig") as f:
        # Drop duplicate lines while keeping first-seen order.
        lst = list(dict.fromkeys(f))
    with open(cfg()['path'], 'a', encoding='utf-8-sig') as f2:
        for line in lst:
            f2.write(line)
    toWordCloud()
总结
以上是内存溢出为你收集整理的Python-在线文本情感分析实验全部内容，希望文章能够帮你解决Python-在线文本情感分析实验所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错，欢迎将内存溢出网站推荐给程序员好友。
欢迎分享，转载请注明来源：内存溢出
原文地址:https://www.54852.com/langs/1184620.html
Python-在线文本情感分析实验

发表评论

评论列表（0条）