In [50]:
from pickle import load
import pandas as pd
# Load the pickled dependency-parse table from its fixed location on disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted run of this project.
# The `with` block closes the file handle, which the bare open() call leaked.
with open('/home/jeffmxh/dp_relation/raw_data/output/temp/雅诗兰黛用户微博原创_dpOlay_雅诗兰黛用户微博原创_content_outputltp_loc.pickle','rb') as pickle_file:
    doc_dp = load(pickle_file)
#print(doc_dp[doc_dp.content_id==98])
# Keep only rows where neither target_pos nor source_pos is "v" and the
# relation is not "ADV".
doc_dp = doc_dp.loc[(doc_dp.target_pos!="v") & (doc_dp.source_pos!="v") & (doc_dp.relation!="ADV")]
print(doc_dp)
In [48]:
# Scratch check: all three comparisons hold, so the `and` chain evaluates to True.
(2 > 1) and (3 > 2) and (5 > 4)
Out[48]:
In [23]:
# Pull out the row at integer position 750 of doc_dp (all columns) and print it.
row_750 = doc_dp.iloc[750, :]
print(row_750)
In [19]:
from pickle import load
import pandas as pd
# Load the pickled dependency-parse table for the Tmall review data set.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted run of this project.
# The `with` block closes the file handle, which the bare open() call leaked.
with open('/home/jeffmxh/dp_relation/raw_data/output/temp/content_yili_dp美汁源-果粒奶优_伊利优酸乳-天猫旗舰1Tmall评论_content_outputltp_loc.pickle','rb') as pickle_file:
    doc_dp = load(pickle_file)
target_list = doc_dp['target']
# len() of every entry in the 'target' column, in row order.
length_list = [len(target) for target in target_list]
# Print the position and value of every target whose length exceeds 50.
for i, length in enumerate(length_list):
    if length>50:
        print(i)
        print(target_list.iloc[i])
# default=0 avoids the ValueError that max() raises when the table has no rows.
print(max(length_list, default=0))
In [6]:
import os
import re
import pandas as pd
from os import path
import time
import argparse
import pymysql
import logging
import pymysql
import pandas as pd
def get_db_data(query_str):
    """Run ``query_str`` against the local ``dp_relation`` MySQL database.

    Parameters
    ----------
    query_str : str
        SQL statement, passed unchanged to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The query result with every all-NULL column removed. A result with
        zero rows therefore comes back with no columns at all, because every
        column is vacuously all-NULL.

    Raises
    ------
    Whatever ``pymysql.connect`` or ``pandas.read_sql_query`` raise; the
    connection is closed before the exception propagates.
    """
    # NOTE(review): the credentials are hard-coded in the notebook; move them
    # to environment variables or an untracked config file before sharing.
    conn = pymysql.connect(host='127.0.0.1',port=3306,user='analyzer',password='analyzer@tbs2016',database='dp_relation',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
    try:
        doc = pd.read_sql_query(query_str, conn)
    finally:
        # Always release the connection, including when the query fails.
        conn.close()
    # dropna(axis=1, how='all') removes columns that contain only NULLs and,
    # unlike a per-name `del doc[column]` loop, copes with duplicated names.
    return doc.dropna(axis=1, how='all')
def get_data_info(project_name,keyword):
    """Look up the ids and source table registered for a project/keyword pair.

    Reads the first row of ``project_key`` whose ``project_name`` and
    ``keyword`` columns equal the arguments.

    Parameters
    ----------
    project_name : str
        Value compared against ``project_key.project_name``.
    keyword : str
        Value compared against ``project_key.keyword``.

    Returns
    -------
    tuple
        ``(project_id, keyword_id, table_name)`` where ``table_name`` is the
        row's ``info_source_sheet`` column.

    Raises
    ------
    IndexError
        If no row matches the pair.
    """
    # NOTE(review): the credentials are hard-coded in the notebook; move them
    # to environment variables or an untracked config file before sharing.
    conn = pymysql.connect(host='127.0.0.1',port=3306,user='analyzer',password='analyzer@tbs2016',database='dp_relation',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            # Placeholders let the driver quote the values, so names that
            # contain quotes no longer break (or alter) the statement.
            cur.execute(
                "SELECT project_id,keyword_id,info_source_sheet FROM project_key where project_name=%s and keyword = %s",
                (project_name, keyword),
            )
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # The original never closed the cursor or the connection.
        conn.close()
    if not rows:
        # Same exception type as indexing an empty result, with a usable message.
        raise IndexError("no project_key row for project_name=%r, keyword=%r" % (project_name, keyword))
    id_num = rows[0]
    project_id = id_num['project_id']
    keyword_id = id_num['keyword_id']
    table_name = id_num['info_source_sheet']
    return project_id,keyword_id,table_name
def readmysql(**kwargs):
    """Load every row of one source table for a project/keyword pair.

    Keyword arguments (all required)
    --------------------------------
    table_name : str
        Table to read; interpolated into the SQL text as-is.
    project_id : int
        Formatted with ``%d`` into the ``project_id=`` condition.
    keyword_id
        Formatted into a quoted literal for the ``keyword_id =`` condition.

    Returns
    -------
    pandas.DataFrame
        The matching rows with every all-NULL column removed. A result with
        zero rows therefore comes back with no columns at all.

    Raises
    ------
    KeyError
        If one of the required keyword arguments is missing.
    """
    # Build the statement first so a missing keyword argument fails before a
    # connection is opened (the original opened one and then leaked it).
    # NOTE(review): values are interpolated into the SQL text; only pass
    # identifiers and ids that come from a trusted source.
    query_str = "SELECT * FROM %s where project_id=%d and keyword_id = '%s'" %(kwargs['table_name'],kwargs['project_id'],kwargs['keyword_id'])
    # NOTE(review): the credentials are hard-coded in the notebook; move them
    # to environment variables or an untracked config file before sharing.
    conn = pymysql.connect(host='127.0.0.1',port=3306,user='analyzer',password='analyzer@tbs2016',database='dp_relation',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
    try:
        doc = pd.read_sql_query(query_str, conn)
    finally:
        # Always release the connection, including when the query fails.
        conn.close()
    # dropna(axis=1, how='all') removes columns that contain only NULLs and,
    # unlike a per-name `del doc[column]` loop, copes with duplicated names.
    return doc.dropna(axis=1, how='all')
def remove_abnormal_user(project_id, document, source_table=None):
    """Drop rows of ``document`` written by users flagged ``is_abnormal=1``.

    Parameters
    ----------
    project_id : int
        Project whose flagged users are fetched from ``weibo_users``.
    document : pandas.DataFrame
        Rows to filter. Needs a ``user_id`` column when the source table is
        ``'weibo_raw_data'`` and a ``weibo_user_origin_id`` column when it is
        ``'weibo_messages'``.
    source_table : str, optional
        Name of the table ``document`` was read from. When omitted, the
        notebook-level ``table_name`` variable is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``document`` without the flagged users' rows. ``document`` is returned
        unchanged when no user is flagged or when the source table is neither
        of the two weibo tables.
    """
    abnormal_user = get_db_data("SELECT id,origin_id,screen_name FROM weibo_users WHERE project_id=%d AND is_abnormal=1"%project_id)
    if len(abnormal_user) == 0:
        return document
    if source_table is None:
        # Backward compatible fallback to the notebook-level variable; resolved
        # only here so the no-abnormal-user path never needs it.
        source_table = table_name
    # isin() builds its lookup once, instead of rebuilding a list for every row.
    if source_table == 'weibo_raw_data':
        return document.loc[~document['user_id'].isin(abnormal_user['id'])]
    if source_table == 'weibo_messages':
        return document.loc[~document['weibo_user_origin_id'].isin(abnormal_user['origin_id'])]
    # Any other source table: nothing to match on, so keep every row (the
    # original could leave its result variable unassigned on some paths).
    return document
# Resolve the ids and the source table for the project/keyword under study.
data_info = get_data_info('华为P10_170511', 'iphone7_京东评论')
project_id, keyword_id, table_name = data_info
# Read every row of that table for this project/keyword.
document = readmysql(table_name=table_name, project_id=project_id, keyword_id=keyword_id)
# Number the rows 0..n-1 in a new 'content_id' column.
row_numbers = pd.Series(list(range(len(document))))
document.loc[:, 'content_id'] = row_numbers
# document['content'] = list(map(text_mechanical_removal.main,document['content']))
document['content']
Out[6]:
In [28]:
import re
# ASCII control characters 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F, i.e. every
# control character below 0x20 except tab, line feed and carriage return.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def illegal_sub(x):
    """Return ``x`` with every character matched by ILLEGAL_CHARACTERS_RE removed."""
    return ILLEGAL_CHARACTERS_RE.sub('', x)


def illegal_match(x):
    """Return the first illegal-character match found anywhere in ``x``, or None."""
    # search() scans the whole string; match() only tested position 0 and so
    # missed control characters that appear later in the text.
    return ILLEGAL_CHARACTERS_RE.search(x)


# Cleaned copy of the text (the original also ran this map once more and
# discarded the result).
document['content_sub'] = document['content'].map(illegal_sub)
# Rows whose text changed, i.e. rows that contained at least one illegal character.
a = document.loc[document['content']!=document['content_sub'],['content','content_sub']]
# Spot-check the row labelled 2358 before and after cleaning.
print(document.content[2358])
print(document.content_sub[2358])
print(a.content_sub)
# document['content'][document['content'].apply(illegal_match)]
ILLEGAL_CHARACTERS_RE.findall(str(document.content[2358]))
Out[28]:
In [19]:
import gensim
# Load the saved Word2Vec model from weibo.zh.text.model and show the words
# it ranks as most similar to "足球" (football).
# NOTE(review): newer gensim releases expose this query as
# model.wv.most_similar - confirm against the installed version.
model = gensim.models.Word2Vec.load("/home/jeffmxh/word2vec/weibo.zh.text.model")
neighbours = model.most_similar(u"足球")
neighbours
Out[19]:
In [56]:
# Most similar words to "失意" according to whichever model `model` currently holds.
neighbours = model.most_similar(u"失意")
neighbours
Out[56]:
In [29]:
# Most similar words to "失望" according to whichever model `model` currently holds.
neighbours = model.most_similar(u"失望")
neighbours
Out[29]:
In [1]:
import gensim
# Load the saved Word2Vec model from wiki.zh.text.model (this rebinds `model`)
# and show the words it ranks as most similar to "足球" (football).
# NOTE(review): newer gensim releases expose this query as
# model.wv.most_similar - confirm against the installed version.
model = gensim.models.Word2Vec.load("/home/jeffmxh/word2vec/wiki.zh.text.model")
neighbours = model.most_similar(u"足球")
neighbours
Out[1]: