In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import re
import numpy as np
import jieba
In [2]:
import pymysql

def get_db_data(query_str):
    '''Run a query against the dp_relation database and return the result as a DataFrame.'''
    conn = pymysql.connect(host='127.0.0.1',
                           port=3306,
                           user='analyzer',
                           password='analyzer@tbs2016',
                           database='dp_relation',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        doc = pd.read_sql_query(query_str, conn)  # read_sql_query already returns a DataFrame
    finally:
        conn.close()
    return doc
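In [ ]:
# Hedged sketch, not part of the original pipeline: pd.read_sql_query accepts a
# `params` argument, so a variant of get_db_data could bind filter values safely
# instead of interpolating them into the SQL string. get_db_data_params is a
# hypothetical helper name.
def get_db_data_params(query_str, params=None):
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='analyzer',
                           password='analyzer@tbs2016', database='dp_relation',
                           charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    try:
        return pd.read_sql_query(query_str, conn, params=params)
    finally:
        conn.close()

# e.g. get_db_data_params("SELECT * FROM emotion_analyse WHERE project_id=%s", (35,))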
In [3]:
'''
Weibo text filtering
'''
def re_sub(text_l):
    'Strip hyperlinks, hashtags, @-mentions, emoticon codes and extra whitespace from the text.'
    if not isinstance(text_l, str):
        text_l = str(text_l)  # coerce non-string values before cleaning
    text_s = re.sub(r'\s+', ' ', text_l)                      # collapse runs of whitespace
    text_s = re.sub(' ', ',', text_s)                         # turn remaining spaces into commas
    text_s = re.sub(r'#.+?#|\[.+?]|【.+?】', '', text_s)      # hashtags and bracketed tags
    text_s = re.sub(r'https?:[a-zA-Z/.0-9_]+', '', text_s)    # URLs
    text_s = re.sub(r'@.+?[,,:: )]|@.+?$', '', text_s)        # @-mentions
    text_s = re.sub(r'我在(\w){0,2}[::](\w*)', '', text_s)   # location check-in snippets
    text_s = re.sub(r'\[(\w){1,4}\]', '', text_s)             # emoticon codes like [doge]
    text_s = re.sub(r'&[a-z]+;', '', text_s)                  # HTML entities
    return text_s

re_sub_vec = np.vectorize(re_sub)  # vectorize so it applies element-wise to a column
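In [ ]:
# Quick sanity check on a made-up post (the sample text is illustrative, not from the data):
re_sub('#话题# 转发微博 @某人: 今天天气不错 [微笑] http://t.cn/abc123')
# the hashtag, mention, emoticon code and URL are stripped; separators become commas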
In [4]:
data = get_db_data("SELECT * FROM emotion_analyse WHERE project_id=35 AND keyword_id='35_1'")
data.content = re_sub_vec(data.content)
In [5]:
#data = pd.read_excel('/home/jeffmxh/sentiment_result.xlsx')
#data.content = re_sub_vec(data.content)
#data = data.loc[:,['content']]
#data.loc[1:10:]
In [6]:
'''
Split each comment into individual sentences
'''
def sentence_split(content):
    sentence = str(content)
    sentence = re.sub('\u200b', '', sentence)   # drop zero-width spaces
    result = re.split(r'[。?!.?!]', sentence)  # split on Chinese and ASCII sentence enders
    return [ele for ele in result if len(ele) > 1]
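In [ ]:
# Illustrative check (the sample sentence is made up):
sentence_split('服务很好!菜品一般。还会再来吗?')
# -> ['服务很好', '菜品一般', '还会再来吗']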
In [7]:
data['content_list'] = data['content'].map(sentence_split)
data.loc[1:2,:]
Out[7]: (output omitted in this export)
In [13]:
# Tokenizer class wrapping jieba
import os
import re
from os import path
import jieba

class jieba4null():
    """
    Word segmentation helper: cut_sentence() takes a list of sentences
    and returns one token list per sentence.
    """
    def __init__(self, n_core=16):
        self.rootdir = os.getcwd()
        #self.STOP_WORDS_LIST = self.load_txt('/home/jeffmxh/stopwords_utf8.txt')
        #self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
        jieba.load_userdict('/home/jeffmxh/emotion_user_dict.txt')  # domain-specific user dictionary
        self.n_CORE = n_core
        jieba.enable_parallel(self.n_CORE - 1)  # parallel segmentation (POSIX only)

    def filter_stop(self, input_text):
        # Only usable when STOP_WORDS_LIST is loaded in __init__ above
        for token in input_text:
            if token not in self.STOP_WORDS_LIST:
                yield token

    def cut_word(self, sent):
        #words = self.filter_stop(jieba.cut(sent, cut_all=False))
        words = jieba.cut(sent, cut_all=False)  # precise-mode segmentation
        return [tok for tok in words if tok != '\u200b']  # drop zero-width spaces

    def cut_sentence(self, sent_list):
        return [self.cut_word(sent) for sent in sent_list]

    def load_txt(self, file):
        with open(file, 'r', encoding='utf-8') as f_h:
            return [line.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for line in f_h]
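In [ ]:
# Hedged note: jieba.enable_parallel forks worker processes and raises
# NotImplementedError on Windows. A portable guard could look like this
# (a sketch, not original code; safe_enable_parallel is a hypothetical name):
import platform
def safe_enable_parallel(n):
    if platform.system() != 'Windows':
        jieba.enable_parallel(n)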
In [14]:
seg_word = jieba4null()
data.loc[:,'seg_words'] = data['content_list'].map(seg_word.cut_sentence)
#data
In [15]:
data.loc[:,['content','seg_words']]
Out[15]: (output omitted in this export)
In [10]:
import pandas as pd
import re

class polar_classifier():
    '''
    Scores the sentiment polarity of lists of segmented sentences.
    '''
    def __init__(self):
        self.pos_list = self.load_txt('/home/jeffmxh/full_pos_dict_sougou.txt')
        self.neg_list = self.load_txt('/home/jeffmxh/full_neg_dict_sougou.txt')
        self.degree_dict = pd.read_excel('/home/jeffmxh/py_sentiment_analyse/degree_dict.xlsx')
        self.deny_dict = ['不', '不是', '没有']  # negation words

    def load_txt(self, file):
        with open(file, 'r', encoding='utf-8') as f_h:
            res = [line.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for line in f_h]
        return [re.sub('\n', '', item) for item in res]

    # Polarity of a single word: +1 positive, -1 negative, 0 neutral
    def word_polar_classify(self, word, pos_list, neg_list):
        if word in pos_list:
            return 1
        elif word in neg_list:
            return -1
        else:
            return 0

    # Strength of a degree adverb (1~6); 0 if the word is not a degree adverb
    def word_strength_classify(self, word, degree_dict):
        sub_dict = degree_dict.loc[degree_dict.word == word, :]
        if sub_dict.shape[0] == 0:
            return 0
        else:
            return sub_dict.iloc[0, 1]  # second column holds the strength

    # Negation check: -1 if the word negates, 1 otherwise
    def word_deny_classify(self, word, deny_dict):
        return -1 if word in deny_dict else 1

    # Score one token list: each sentiment word contributes polarity * sign * weight,
    # where preceding negations flip the sign and degree adverbs raise the weight
    def single_list_classify(self, seg_list):
        sign = 1
        k = 1
        result_list = []
        for word in seg_list:
            polar_temp = self.word_polar_classify(word, self.pos_list, self.neg_list)
            if polar_temp != 0:
                result_list.append(polar_temp * sign * k)
            else:
                sign *= self.word_deny_classify(word, self.deny_dict)
                k += self.word_strength_classify(word, self.degree_dict)
        if len(result_list) == 0:
            return 'None'  # string sentinel: no sentiment word found
        return sum(result_list)

    # Score a list of token lists (one comment = several sentences)
    def multi_list_classify(self, big_seg_list):
        res = [self.single_list_classify(seg_list) for seg_list in big_seg_list]
        senti_list = [x for x in res if x != 'None']
        if len(senti_list) == 0:
            return 'None'
        return sum(senti_list)
In [11]:
worker = polar_classifier()
#worker.multi_list_classify(data.seg_words[40])
data['polar'] = data['seg_words'].map(worker.multi_list_classify)
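In [ ]:
# Worked example on made-up tokens (illustrative only; it assumes '好' appears in the
# positive dictionary, '不' is in deny_dict and '很' is listed as a degree adverb):
worker.single_list_classify(['不', '很', '好'])
# '不' flips sign to -1, '很' adds its strength to the weight k, and '好' then
# contributes polarity(+1) * sign(-1) * k, so the sentence scores negative.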
In [12]:
data = data.drop(['content_list','seg_words'], axis = 1)
data.loc[0:2,:]
Out[12]: (output omitted in this export)
In [13]:
data.columns
Out[13]: (output omitted in this export)
In [14]:
data.loc[1:2,:]
Out[14]: (output omitted in this export)
In [25]:
def update_sql_polar(emotion_result):
    '''Write the polarity scores back to the emotion_analyse table.'''
    conn = pymysql.connect(host='127.0.0.1',
                           port=3306,
                           user='analyzer',
                           password='analyzer@tbs2016',
                           database='dp_relation',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cur:
            for i in range(emotion_result.shape[0]):
                # Parameterized query: pymysql escapes the values, avoiding
                # manual quoting and SQL injection
                cur.execute("UPDATE emotion_analyse SET polar=%s WHERE id=%s",
                            (str(emotion_result.polar[i]), int(emotion_result.id[i])))
        conn.commit()
    finally:
        conn.close()
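In [ ]:
# Hedged alternative (not in the original notebook; update_sql_polar_batch is a
# hypothetical name): cursor.executemany sends the whole batch in one call, which
# is usually faster for large result sets.
def update_sql_polar_batch(emotion_result):
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='analyzer',
                           password='analyzer@tbs2016', database='dp_relation',
                           charset='utf8mb4')
    try:
        rows = [(str(p), int(i)) for p, i in zip(emotion_result.polar, emotion_result.id)]
        with conn.cursor() as cur:
            cur.executemany("UPDATE emotion_analyse SET polar=%s WHERE id=%s", rows)
        conn.commit()
    finally:
        conn.close()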
In [12]:
writer = pd.ExcelWriter('emotion_result.xlsx')
data.to_excel(writer, sheet_name='sheet1', encoding='utf-8', index=False)  # write results to Excel
writer.save()
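In [ ]:
# Equivalent export using a context manager (a sketch; newer pandas deprecates
# writer.save() in favour of close() and drops the `encoding` argument of to_excel,
# so this form is more portable):
with pd.ExcelWriter('emotion_result.xlsx') as writer:
    data.to_excel(writer, sheet_name='sheet1', index=False)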