In [1]:
import pandas as pd

# Read the sentiment lexicon workbook and keep only the rows whose `pos`
# column is 'noun' or 'idiom'.
emotion_dict = pd.read_excel('/home/jeffmxh/py_sentiment_analyse/emotion_dict.xlsx')
emotion_dict = emotion_dict[emotion_dict['pos'].isin(['noun', 'idiom'])]

In [2]:
import gensim
# Load a saved word2vec model from disk. Only `model_sougou` is bound by this
# cell; the commented-out lines are alternative model files.
# NOTE(review): a later cell calls get_nearby_word(word, model_wiki); that name
# only exists if the first commented-out line below is re-enabled.
#model_wiki = gensim.models.Word2Vec.load("/home/jeffmxh/word2vec/wiki.zh.text.model")
#model_weibo = gensim.models.Word2Vec.load("/home/jeffmxh/word2vec/weibo.zh.text.model")
model_sougou = gensim.models.Word2Vec.load('/home/jeffmxh/word2vec/sougou_news.text.model')
#model_sougou = gensim.models.Word2Vec.load('/home/zww/word2vec/word2vec_wx')

In [3]:
def get_nearby_word(target_word, model):
    """Return the nearest neighbours of ``target_word`` and their similarity scores.

    Parameters
    ----------
    target_word : str
        Word to look up.
    model : object
        Either an object exposing ``most_similar(word)`` directly, or a
        gensim ``Word2Vec``-style model exposing it on ``model.wv``.

    Returns
    -------
    (list, list)
        ``(words, similarities)`` in the order returned by ``most_similar``.
        Two separate empty lists are returned when the word cannot be looked up.

    NOTE: a later cell redefines a function with the same name that returns a
    single filtered list instead of this pair.
    """
    # gensim keeps the query API on `model.wv`, and gensim 4 removed
    # `Word2Vec.most_similar`; fall back to the object itself when it has no `wv`.
    vectors = getattr(model, 'wv', model)
    try:
        neighbours = vectors.most_similar(target_word)
    except (KeyError, TypeError):
        # Out-of-vocabulary word or a non-string cell: treat as "no neighbours".
        # Anything else (including KeyboardInterrupt) is allowed to propagate so
        # that configuration errors are not silently turned into empty results.
        return [], []
    word_list = [word for word, _ in neighbours]
    dist_list = [score for _, score in neighbours]
    return word_list, dist_list

In [ ]:
# Re-read the full lexicon (no POS filter), keep the word and polarity columns,
# and attach each word's neighbours and similarity scores.
# This cell needs the get_nearby_word variant that returns (words, similarities).
emotion_dict = pd.read_excel('/home/jeffmxh/py_sentiment_analyse/emotion_dict.xlsx')
emotion_dict = emotion_dict.loc[:, ['word', 'polar_1']]
# String placeholders make both columns object dtype, so a cell can hold a list.
emotion_dict['nearby_words'] = 'None'
emotion_dict['similarity'] = 'None'
for i in range(emotion_dict.shape[0]):
    word_list, dist_list = get_nearby_word(emotion_dict.at[i, 'word'], model_sougou)
    # DataFrame.set_value was removed in pandas 1.0; `.at` is the scalar setter.
    emotion_dict.at[i, 'nearby_words'] = word_list
    emotion_dict.at[i, 'similarity'] = dist_list
    if i % 1000 == 0:
        print(i, 'words finished!')


0 words finished!
1000 words finished!
2000 words finished!

In [7]:
# Keep only the lexicon rows for which at least one neighbour was found, write
# them to an Excel workbook, and display the result.
# The definition of `final_dict` used to be commented out, which made this cell
# fail with NameError on a fresh kernel.
final_dict = emotion_dict.loc[emotion_dict['nearby_words'].apply(lambda x: len(x) > 0), :]
final_dict.to_excel('extended_dict_sougou_news.xlsx', sheet_name='sheet1')
final_dict


Out[7]:
word polar_1 nearby_words similarity
0 脏乱 -1 [脏乱差, 杂乱不堪, 污浊, 破乱, 臭气熏天, 破败, 杂乱, 卫生死角, 脏, 残旧] [0.7303634881973267, 0.6193597316741943, 0.610...
2 早衰 -1 [月经不调, 更年期, 骨质疏松症, 闭经, 多毛, 老年斑, 绝经期, 口角炎, 夜盲症,... [0.6509312987327576, 0.6218754053115845, 0.620...
3 责备 -1 [责怪, 埋怨, 责骂, 做错事, 训斥, 呵斥, 奚落, 数落, 批评, 自责] [0.7649672031402588, 0.7129254341125488, 0.700...
4 贼眼 -1 [莎时, 好鞋, 鹿队, 滴溜溜, 黑眸, 秀目, 鹿是, 那双, 莲足, 似喜] [0.5989378690719604, 0.5584024786949158, 0.550...
5 战祸 -1 [民不聊生, 灾荒, 战乱, 灾祸, 外敌, 苦难, 饥寒, 浴血奋战, 亡国, 生灵涂炭] [0.5778040885925293, 0.564408540725708, 0.5314...
8 中山狼 -1 [万人敌, 严嵩之, 公孙, 妖物, 夸大之词, 虎须, 文丑, 道济, 刘盈, 杀人不眨眼] [0.4830344319343567, 0.4751688539981842, 0.464...
9 清莹 1 [碧绿, 洁白如玉, 簇簇, 翠竹, 青翠, 丝润, 峻峭, 雪白, 苍翠, 玲珑剔透] [0.6202049851417542, 0.605355978012085, 0.5924...
12 求索 1 [探求, 追寻, 永无止境, 路漫漫其修远兮, 孜孜以求, 吾将上下而求索, 追求理想, 孜... [0.606696605682373, 0.5868327617645264, 0.5798...
13 热潮 1 [风潮, 狂潮, 新高潮, 浪潮, 大潮, 潮, 投资热, 高潮, 旅游热, 掀起] [0.7391874194145203, 0.7364331483840942, 0.659...
14 仁政 1 [仁义, 孟子, 为政, 治世, 仁者, 之德, 儒家, 纲常, 之学, 荀子] [0.6483582258224487, 0.5988097190856934, 0.598...
15 荣名 1 [卑位, 氨乙基, 痰稀, 周柄华, 莫青远, 崔玉蓉, 固皇图, 荒于嬉, 邹娴, 可煮] [0.7630715370178223, 0.6984013915061951, 0.676...
16 柔腻 1 [如漆, 胎骨, 吹口, 灰中, 眼要, 全缘, 交映, 呈白, 曲廊, 暑往寒来] [0.5962815284729004, 0.5771669745445251, 0.571...
17 瑞雪 1 [徐壮志, 李宣良, 田兆运, 颜昊, 巩琳萌, 春堂, 孙彦新, 祝绪丹, 钰, 张旭光] [0.6639033555984497, 0.5953749418258667, 0.549...
19 神采 1 [光采, 贵气, 青春活力, 双眸, 娇柔, 神采飞扬, 娇媚, 炯炯有神, 美态, 野性美] [0.6916899681091309, 0.6493357419967651, 0.639...
21 盛开 1 [一朵朵, 姹紫嫣红, 吐艳, 繁花, 开满, 锦簇, 花开, 烂漫, 含苞待放, 绽放] [0.7052971124649048, 0.7051777839660645, 0.693...
22 盛誉 1 [盛名, 美誉, 称誉, 崇高威望, 即大类, 美名, 其墓壁, 胜名, 声望, 美称] [0.5861307382583618, 0.5756721496582031, 0.560...
23 诗仙 1 [太白, 李白, 黄庭坚, 诗圣, 古有, 山水诗, 苏东坡, 孟浩然, 赞曰, 白居易] [0.6643140316009521, 0.5916151404380798, 0.547...
24 爽脆 1 [鲜甜, 鲜嫩, 爽滑, 甘香, 鲜香, 甜而不腻, 香脆, 多汁, 清甜, 酥脆] [0.7476903200149536, 0.7472080588340759, 0.730...
25 素洁 1 [草绿, 布裙, 桃红, 杏黄, 对襟, 玫瑰红, 水绿, 鹅黄, 衣裙, 纱裙] [0.5787248015403748, 0.5673038363456726, 0.560...
27 株连 -1 [通敌, 迫害, 牵连, 谋反, 告发, 诬陷, 处死, 谋害, 陷害, 包庇] [0.5798708200454712, 0.5796756148338318, 0.563...
28 坠毁 -1 [坠机, 失事, 空难, 坠落在, 空军基地, 斯摩棱斯克, 撞山, 错降, 坠落, 一架] [0.7000687122344971, 0.616642951965332, 0.5769...
29 滋生 -1 [孳生, 滋生腐败, 滋长, 温床, 尘螨, 助长, 寻租, 极容易, 霉菌, 贪污腐败] [0.6457067131996155, 0.6412344574928284, 0.597...
30 自恃 -1 [骄横, 放在眼里, 跋扈, 恃强凌弱, 颐指气使, 恃, 目中无人, 清高, 专横, 之辈] [0.559232234954834, 0.5581157803535461, 0.5442...
31 走后门 -1 [拉关系, 开后门, 托关系, 跑官, 要官, 不认, 买官, 歪门邪道, 人情, 非要] [0.5731136202812195, 0.542564332485199, 0.5348...
32 阻力 -1 [阻力位, 卖压, 下档, 支撑位, 压力, 回测, 上轨, 抛压, 该位, 回档] [0.610942542552948, 0.5151865482330322, 0.5135...
33 罪状 -1 [大罪, 治罪, 列举, 昏君, 死罪, 谋反, 罪名, 欲加之罪, 罪过, 皆空] [0.5179793238639832, 0.49672284722328186, 0.49...
34 做戏 -1 [做作, 造作, 矫揉造作, 矫情, 故弄玄虚, 奸诈, 口是心非, 不正经, 正人君子, ... [0.5676361918449402, 0.5366177558898926, 0.535...
35 甜头 1 [苦头, 苦果, 尝到, 尝到了, 滋味, 商机, 回头客, 没尝够, 第一桶金, 巨额利润] [0.5142561793327332, 0.4544307589530945, 0.451...
36 头名 1 [小组第一, 榜首, 出线权, 头把交椅, 第一名, 次席, 第二名, 前二, 一分之差, ... [0.7187870740890503, 0.633434534072876, 0.6049...
41 无上 1 [荣耀, 至高无上, 无尚, 荣誉, 战吧, 骄傲, 象征, 无比, 文为证, 无与伦比] [0.72539883852005, 0.6761458516120911, 0.66392...
... ... ... ... ...
21544 订婚 1 [完婚, 结婚, 成婚, 迎娶, 喜结连理, 未婚夫, 下嫁, 拍拖, 相恋, 求婚] [0.6895933151245117, 0.6769195199012756, 0.629...
21551 艰贞 1 [邦家, 哈默斯, 吴康金, 天威英, 奥里亚, 治湿痹, 蒙台梭, 夏不为, 塔尔德, 意大] [0.7324769496917725, 0.5873437523841858, 0.535...
21552 宁缺毋滥 1 [宁缺勿滥, 完美主义, 谁抱, 而宇舶, 迁就, 贼不走空, 强求, 择偶, 金钱至上, ... [0.6624048352241516, 0.49664202332496643, 0.46...
21554 强兵富国 1 [国谈, 但庆庆, 着悦动, 袜式, 郑圣一, 自宙, 黄倾整, 李亨泽以, 孙光圻, 梁秋华] [0.5226610898971558, 0.521201491355896, 0.5173...
21557 让路 1 [人匝舌, 推迟, 绕道而行, 劳民伤财, 耽误到, 改期, 赶时间, 绕行道路, 让位, 舍弃] [0.4300824701786041, 0.38105976581573486, 0.37...
21558 人心 1 [民心, 心, 我心, 心灵, 内心, 内心深处, 心灵深处, 心弦, 心神, 的心] [0.6149510741233826, 0.5986208915710449, 0.537...
21564 塞北江南 1 [三人诉, 以岛, 秀岭, 质润, 曲廊, 张廉云, 宰相之才, 葛洪在, 姜雨辰, 毕宏] [0.6404839158058167, 0.6360913515090942, 0.631...
21576 搜肠刮肚 -1 [尽索, 写作文, 存底, 出题者, 答多, 答好, 小问, 作题, 可写, 来考] [0.602791965007782, 0.505332350730896, 0.49876...
21582 太岁头上动土 -1 [越权, 打击报复, 包庇, 非法行为, 偷排, 从严处理, 护短, 排污, 行贿受贿, 隐... [0.38582471013069153, 0.3727850914001465, 0.36...
21584 谈空说有 -1 [夜不眠, 河东狮子, 笨官, 涳, 海者, 压日何, 骨仙, 忽闻, 天挥云驱, 殿里] [0.9472858309745789, 0.6658039093017578, 0.656...
21585 谭天说地 1 [燕捞, 王忠瑞, 雪蛤官, 萌煞, 乡归, 由小笔, 入面, 吵来了, 涳, 八出] [0.5403668284416199, 0.5275372266769409, 0.525...
21586 逃犯 -1 [追逃, 通缉逃犯, 在逃犯, 通缉, 通缉犯, 部督, 疑犯, 负案在逃, 案犯, 抢劫杀人] [0.6995433568954468, 0.6962267160415649, 0.595...
21591 天下大乱 -1 [大乱, 割据, 民不聊生, 三国鼎立, 军阀割据, 群雄割据, 逐鹿中原, 朝纲, 山雨欲... [0.6270589828491211, 0.6055105924606323, 0.586...
21593 同声同气 1 [黄子华, 冇, 阿展, 哋, 讲古, 吴康民, 食神, 啰, 乜嘢, 啲] [0.4163723587989807, 0.4042702317237854, 0.401...
21596 脱轨 -1 [脱线, 起火燃烧, 黎湛铁路, 一列, 轧过, 自强号, 翻车, 幸未, 堑坡, 相撞] [0.6030473709106445, 0.5946685075759888, 0.538...
21597 伟力 1 [义兴, 聃, 益广, 文富, 凯华, 琰, 迅为, 建威, 蔡会, 秦昭襄] [0.6142561435699463, 0.6131936311721802, 0.605...
21599 卫护 1 [之液, 驭动, 垂危, 出窍, 垂危之际, 地养, 奄奄一息, 躯壳, 诚可贵, 心室扑动] [0.42099297046661377, 0.41039109230041504, 0.4...
21600 文牍主义 -1 [事务主义, 自乘, 放平的, 美便, 越垦, 工于, 出祸, 曹全碑, 阴法, 温变] [0.45226192474365234, 0.4373161494731903, 0.43...
21602 獬豸 1 [象征物, 神社, 景福宫, 菩萨像, 神宫, 图腾, 三景, 田泽湖, 圆光, 象形文字] [0.5295747518539429, 0.5145773887634277, 0.486...
21603 朽迈 -1 [保护视力, 通便, 多吃生, 生吃, 常吃点, 常吃, 肠脏, 菠菜, 脂餐, 根皮] [0.3803459107875824, 0.3793008327484131, 0.378...
21604 阳关道 1 [独木桥, 康庄大道, 歪路, 回头路, 这条路, 邪路, 歧路, 了样, 归之路, 影之路] [0.5933928489685059, 0.5786212682723999, 0.575...
21614 荧惑 -1 [蛇蛋, 离乱, 荧荧, 封泥, 钤印, 飞针, 阳燧, 每窝, 铜币, 长剑] [0.5490721464157104, 0.5290964841842651, 0.496...
21615 硬气 1 [傲气, 霸气, 张狂, 豪气, 阴柔, 唯唯诺诺, 杀气, 英气, 小家子气, 刚硬] [0.6133681535720825, 0.5702893733978271, 0.559...
21617 幽闲 1 [第十七期, 抽象派, 水粉画, 画中, 彭鸣亮, 性灵, 写实派, 写实主义, 谢志高, ... [0.4339081048965454, 0.42009398341178894, 0.41...
21619 游历 1 [遍游, 走遍, 游遍, 闯荡, 饱览, 踏遍, 周游世界, 负笈, 旅人, 云游] [0.6597310304641724, 0.6503134965896606, 0.612...
21620 友邦 1 [柏瑞, 吉年丰, 特拉卡, 兴高, 元田, 柏瑞则, 泰信, 鹏华, 柏瑞的, 柏瑞以] [0.613128125667572, 0.5874271392822266, 0.5004...
21622 余党 -1 [金兵, 狄青, 李自成, 俘, 韩信, 孔有德, 所杀, 率兵, 高演, 安禄山] [0.48866474628448486, 0.48446810245513916, 0.4...
21623 与国 1 [痛悼, 而国, 曩, 共战, 吾民, 辛梓, 家国, 其心, 兴亡, 吾国] [0.3876909613609314, 0.3320576548576355, 0.324...
21624 月饼 1 [粽子, 水饺, 绿豆糕, 粽, 礼盒, 生日蛋糕, 年夜饭, 中秋月饼, 莲香楼, 馅料] [0.6401317715644836, 0.5594436526298523, 0.558...
21625 增生 1 [增生症, 恶变, 囊性, 病变, 结节, 癌变, 剥脱, 间质, 炎性, 性病变] [0.6966867446899414, 0.6836520433425903, 0.667...

11147 rows × 4 columns


In [3]:
#####################################
# Production version: fetch near-synonyms
#####################################

def get_nearby_word(target_word, model, threshold=0.7):
    """Return the neighbours of ``target_word`` whose similarity exceeds ``threshold``.

    Parameters
    ----------
    target_word : str
        Word to look up.
    model : object
        Either an object exposing ``most_similar(word)`` directly, or a
        gensim ``Word2Vec``-style model exposing it on ``model.wv``.
    threshold : float, optional
        Strict lower bound on the similarity score (default 0.7).

    Returns
    -------
    list
        Neighbour words, in the order returned by ``most_similar``; empty when
        the word cannot be looked up or nothing passes the threshold.

    NOTE: this shadows the earlier function of the same name, which returns a
    ``(words, similarities)`` pair.
    """
    # gensim keeps the query API on `model.wv`, and gensim 4 removed
    # `Word2Vec.most_similar`; fall back to the object itself when it has no `wv`.
    vectors = getattr(model, 'wv', model)
    try:
        neighbours = vectors.most_similar(target_word)
    except (KeyError, TypeError):
        # Out-of-vocabulary word or non-string input: no neighbours. Other
        # errors propagate instead of being silently converted to [].
        return []
    return [word for word, score in neighbours if score > threshold]

#get_nearby_word('神采', model_wiki)

#get_nearby_word('神采', model_wiki)

In [9]:
# Split the lexicon words by the sign of `polar_1`: positive, negative, and
# every word with a non-zero polarity. Then report the size of each list.
pos_list = emotion_dict.loc[emotion_dict['polar_1'] > 0, 'word'].tolist()
neg_list = emotion_dict.loc[emotion_dict['polar_1'] < 0, 'word'].tolist()
full_list = emotion_dict.loc[emotion_dict['polar_1'] != 0, 'word'].tolist()
print("pos_list:", len(pos_list))
print("neg_list:", len(neg_list))
print("full_list:", len(full_list))


pos_list: 7622
neg_list: 7319
full_list: 14941

In [10]:
# Grow the positive list with the neighbours of every positive word, skipping
# candidates that are already in the lexicon (`full_list`).
# This cell needs the get_nearby_word variant that returns a single list.
# A candidate reached from several seed words is appended once per hit, exactly
# as before; only the membership test changed (set lookup, not a list scan).
known_words = set(full_list)
full_pos_list = pos_list.copy()
for i, word in enumerate(pos_list):
    if i % 1000 == 0:
        print(i, 'words finished!')
    for new_word in get_nearby_word(word, model_sougou):
        if new_word not in known_words:
            full_pos_list.append(new_word)


0 words finished!
1000 words finished!
2000 words finished!
3000 words finished!
4000 words finished!
5000 words finished!
6000 words finished!
7000 words finished!

In [71]:
# Grow the negative list with the neighbours of every negative word, skipping
# candidates that are already in the lexicon (`full_list`).
# This cell needs the get_nearby_word variant that returns a single list.
# Duplicates are kept as before; only the membership test changed (set lookup
# instead of scanning `full_list` for every candidate).
known_words = set(full_list)
full_neg_list = neg_list.copy()
for i, word in enumerate(neg_list):
    if i % 1000 == 0:
        print(i, 'words finished!')
    for new_word in get_nearby_word(word, model_sougou):
        if new_word not in known_words:
            full_neg_list.append(new_word)


0 words finished!
1000 words finished!
2000 words finished!
3000 words finished!
4000 words finished!
5000 words finished!
6000 words finished!
7000 words finished!
8000 words finished!
9000 words finished!
10000 words finished!

In [13]:
# Compare list sizes before/after expansion and collect the added entries.
print(len(pos_list))
print(len(full_pos_list))
# Set lookup avoids rescanning `pos_list` for every element of `full_pos_list`.
original_pos_words = set(pos_list)
pos_new = [x for x in full_pos_list if x not in original_pos_words]
len(pos_new)


7622
8749
Out[13]:
1127

In [74]:
# Compare list sizes before/after expansion and collect the added entries.
print(len(neg_list))
print(len(full_neg_list))
# Set lookup avoids rescanning `neg_list` for every element of `full_neg_list`.
original_neg_words = set(neg_list)
neg_new = [x for x in full_neg_list if x not in original_neg_words]
#print(neg_new)


10543
17248

In [12]:
def emotion_list_classify(word_list, pos_list, neg_list):
    """Score ``word_list`` as (#words found in ``pos_list``) - (#words found in ``neg_list``).

    Every occurrence counts, so repeated words are counted repeatedly; a word
    present in both reference lists contributes zero overall.
    """
    positive_hits = sum(1 for word in word_list if word in pos_list)
    negative_hits = sum(1 for word in word_list if word in neg_list)
    return positive_hits - negative_hits
#emotion_list_classify(get_nearby_word(a[0], model_sougou), pos_list, neg_list)

In [15]:
#print(get_nearby_word(a[2], model_sougou))
# Build a review table for the newly added positive candidates: each
# candidate's own neighbours and the polarity score those neighbours get from
# the original positive/negative lists.
# This cell needs the get_nearby_word variant that returns a single list.
extend_check_table = pd.DataFrame()
extend_check_table['extend_word'] = pos_new
# String placeholders make both columns object dtype, so a cell can hold a list.
extend_check_table['nearby_words'] = 'None'
extend_check_table['classify_polar'] = 'None'
for i in range(extend_check_table.shape[0]):
    word_list = get_nearby_word(extend_check_table.at[i, 'extend_word'], model_sougou)
    # DataFrame.set_value was removed in pandas 1.0; `.at` is the scalar setter.
    extend_check_table.at[i, 'nearby_words'] = word_list
    extend_check_table.at[i, 'classify_polar'] = emotion_list_classify(word_list, pos_list, neg_list)
    if i % 200 == 0:
        print(i, 'words finished!')


0 words finished!
200 words finished!
400 words finished!
600 words finished!
800 words finished!
1000 words finished!

In [18]:
# Show only the candidates whose neighbours score net-positive.
extend_check_table.loc[extend_check_table['classify_polar'] > 0]


Out[18]:
extend_word nearby_words classify_polar
0 风潮 [热潮] 1
1 狂潮 [热潮] 1
2 卑位 [荣名] 1
3 小组第一 [头名] 1
4 离骚 [楚辞, 名篇, 佳句, 苏轼, 绝句, 辛弃疾, 陆游, 诗句, 刘禹锡, 吟咏] 1
5 名篇 [离骚, 绝句, 古诗, 名句, 楚辞, 佳句, 律诗, 岳阳楼记] 2
7 诗句 [诗中, 绝句, 名句, 古诗, 两句诗, 辛弃疾, 陶渊明, 苏轼, 离骚, 陆游] 1
8 所见所闻 [见闻] 1
9 清玩 [文房, 珍玩] 1
10 景观带 [景观, 绿廊, 生态景观] 1
11 园林景观 [景观, 水景] 1
12 水景 [园林景观, 景观] 1
13 飒爽英姿 [英姿飒爽] 1
14 精彩纷呈 [异彩纷呈, 高潮迭起] 1
15 叫绝 [拍案叫绝, 称绝, 叹为观止, 目不暇给, 咂舌的, 乍舌的, 乍舌地, 目眩神迷, 膛目... 2
16 惊叹不已 [惊叹, 叹为观止, 赞叹不已, 啧啧称奇, 心驰神往, 折服, 大开眼界] 1
18 惊叹 [赞叹, 惊叹不已, 叹服, 叹为观止, 折服, 感叹, 赞叹不已] 1
19 心驰神往 [流连忘返, 目眩神迷, 心醉神迷, 拍案叫绝, 心旷神怡, 回味无穷, 留连忘返, 神往,... 2
20 称绝 [叫绝, 拍案叫绝, 乍舌的, 咂舌的, 乍舌地, 刮目, 目不暇给, 膛目, 肥健, 需系] 1
21 首屈一指 [数一数二] 1
22 名符其实 [名副其实] 1
23 革命先烈 [缅怀, 先烈] 1
24 先进个人 [先进集体, 标兵] 1
25 不朽 [丰碑] 1
26 荣辱与共 [肝胆相照] 1
27 韵味 [风韵, 神韵, 古韵, 韵致] 2
28 古韵 [古风, 古意, 古朴, 神韵, 韵味, 韵致] 1
29 孟子 [荀子, 朱熹, 礼记, 之学, 圣人, 孔夫子] 1
30 风水宝地 [宝地] 1
31 繁荣富强 [繁荣昌盛] 1
... ... ... ...
1039 华滋 [古拙, 浑厚, 气韵, 苍润, 笔力, 苍劲, 意韵, 幽远, 蕴藉, 空灵] 1
1040 笔力 [苍劲, 笔法, 结体, 运笔, 蕴藉, 气韵, 苍润, 华滋, 古拙, 遒劲] 1
1041 雄浑 [大气磅礴, 磅礴, 气势磅礴, 苍劲, 雄健, 刚劲, 气韵, 浑厚, 刚健, 壮阔] 2
1042 意韵 [韵致, 意趣, 气韵, 华滋, 意蕴, 古拙] 2
1043 刚劲 [雄健, 雄浑, 气韵, 刚健, 苍劲] 1
1044 刚健 [雄健, 雄浑, 刚劲, 苍劲, 古拙, 气韵] 1
1046 国与家 [历览, 前贤, 败由奢] 1
1047 历览 [国与家, 前贤] 1
1048 感人至深 [感人肺腑, 催人泪下, 感人, 感天动地, 可歌可泣] 2
1049 感人肺腑 [感人至深, 催人泪下, 感天动地, 感人] 1
1050 气壮山河 [感天动地] 1
1051 献给党 [颂歌] 1
1052 父皇 [皇上, 玄宗, 武帝, 高力士, 杨国忠, 太后, 圣上, 晋武帝, 李隆基, 群臣] 1
1055 蚜虫 [天牛, 虱, 虫害, 害虫, 枯萎病, 红蜘蛛, 蛾, 益虫] 1
1056 惯看 [秋月春风, 江渚上] 1
1057 之寿 [松柏之茂] 1
1068 优良品种 [良种] 1
1081 历史使命 [使命] 1
1092 海誓山盟 [山盟海誓] 1
1093 花前月下 [山盟海誓] 1
1107 感人事迹 [事迹, 英雄事迹, 先进事迹] 1
1108 英雄事迹 [感人事迹, 事迹, 先进事迹] 1
1109 先进事迹 [感人事迹, 事迹, 英雄事迹] 1
1110 美誉 [美称, 美名] 1
1111 刘洋贺 [月里嫦娥, 泪亦多, 干戈] 1
1112 泪亦多 [月里嫦娥, 刘洋贺] 1
1123 举手投足 [眉宇, 一颦一笑] 1
1124 保卫祖国 [保家卫国] 1
1125 山重水复 [疑无路, 柳暗花明又一村] 1
1126 疑无路 [山重水复, 柳暗花明又一村] 1

804 rows × 3 columns


In [13]:
from pickle import dump

# Persist the expanded positive list. The context manager closes (and flushes)
# the file handle, which the bare open(...) call never did.
with open('full_pos_list.pickle', 'wb') as pickle_file:
    dump(full_pos_list, pickle_file)

In [14]:
# Rebuild the expanded negative list using `model_wiki` as the similarity source.
# NOTE(review): `model_wiki` is not bound by any active line in this notebook
# (its load statement is commented out in the model-loading cell); load it
# before running this cell.
# This cell needs the get_nearby_word variant that returns a single list.
# Duplicates are kept as before; only the membership test changed (set lookup
# instead of scanning `full_list` for every candidate).
known_words = set(full_list)
full_neg_list = neg_list.copy()
for i, word in enumerate(neg_list):
    if i % 200 == 0:
        print(i, 'words finished!')
    for new_word in get_nearby_word(word, model_wiki):
        if new_word not in known_words:
            full_neg_list.append(new_word)


0 words finished!
200 words finished!
400 words finished!
600 words finished!
800 words finished!
1000 words finished!
1200 words finished!
1400 words finished!
1600 words finished!
1800 words finished!
2000 words finished!
2200 words finished!
2400 words finished!
2600 words finished!
2800 words finished!
3000 words finished!
3200 words finished!
3400 words finished!
3600 words finished!
3800 words finished!
4000 words finished!
4200 words finished!
4400 words finished!
4600 words finished!
4800 words finished!
5000 words finished!
5200 words finished!
5400 words finished!
5600 words finished!
5800 words finished!
6000 words finished!
6200 words finished!
6400 words finished!
6600 words finished!
6800 words finished!
7000 words finished!
7200 words finished!
7400 words finished!
7600 words finished!
7800 words finished!
8000 words finished!
8200 words finished!
8400 words finished!
8600 words finished!
8800 words finished!
9000 words finished!
9200 words finished!
9400 words finished!
9600 words finished!
9800 words finished!
10000 words finished!
10200 words finished!
10400 words finished!

In [15]:
# Report the negative list size before and after expansion, one per line.
print(len(neg_list), len(full_neg_list), sep='\n')


10543
19672

In [16]:
# Persist the expanded negative list. The context manager closes (and flushes)
# the file handle, which the bare open(...) call never did.
with open('full_neg_list.pickle', 'wb') as pickle_file:
    dump(full_neg_list, pickle_file)

In [4]:
from pickle import load

# Reload the expanded lists from disk and report their sizes.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were written by this notebook.
with open('full_pos_list.pickle', 'rb') as pos_file:
    full_pos_list = load(pos_file)
with open('full_neg_list.pickle', 'rb') as neg_file:
    full_neg_list = load(neg_file)
print(len(full_pos_list))
print(len(full_neg_list))


22120
19672

In [6]:
# Words that ended up in BOTH expanded lists are ambiguous. Count them, then
# keep only those that are not original lexicon entries (`full_list`).
inter_list = list(set(full_pos_list).intersection(set(full_neg_list)))
print(len(inter_list))
# Set lookup avoids scanning `full_list` once per ambiguous word.
lexicon_words = set(full_list)
inter_list = [x for x in inter_list if x not in lexicon_words]
print(len(inter_list))


3235
2501

In [11]:
# Write the ambiguous words one per line. The encoding is pinned to UTF-8
# because the words are Chinese and the platform default may not encode them.
with open('inter_sentiment_dict.txt', 'w', encoding='utf-8') as f:
    f.writelines(word + '\n' for word in inter_list)

In [7]:
#print(full_neg_list)

# Remove the ambiguous words from both expanded lists.
# Set lookup avoids scanning `inter_list` once per list element.
ambiguous_words = set(inter_list)
full_pos_list = [x for x in full_pos_list if x not in ambiguous_words]
full_neg_list = [x for x in full_neg_list if x not in ambiguous_words]

In [65]:
# Report the sizes of the two expanded lists, one per line.
print(len(full_pos_list), len(full_neg_list), sep='\n')


13272
11678

In [66]:
# Write each expanded list to its own text file, one word per line. The
# encoding is pinned to UTF-8 because the words are Chinese and the platform
# default may not encode them.
for dict_path, dict_words in (
    ('full_pos_dict_sougou.txt', full_pos_list),
    ('full_neg_dict_sougou.txt', full_neg_list),
):
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.writelines(word + '\n' for word in dict_words)

In [6]:
from pickle import load
# Reload both pickled lists; the files are now closed after reading.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were written by this notebook.
with open('full_pos_list.pickle', 'rb') as pos_file:
    full_pos_dict = load(pos_file)
# Not displayed: a notebook cell only renders its last expression.
len(full_pos_dict)
with open('full_neg_list.pickle', 'rb') as neg_file:
    full_neg_dict = load(neg_file)
len(full_neg_dict)


Out[6]:
16437

In [1]:
import pandas as pd

# Load the Stata file into a DataFrame and display its (rows, columns) shape.
# NOTE(review): the name `data` is rebound later in this notebook to the result
# of a SQL query, so cells using `data` depend on execution order.
data = pd.read_stata('/home/jeffmxh/Work_Retirement_and_Pension.dta')
data.shape


Out[1]:
(18385, 717)

In [2]:
# Peek at the first five column labels.
data.columns[:5]


Out[2]:
Index(['ID', 'householdID', 'communityID', 'xrtype', 'zf1'], dtype='object')

In [3]:
# Display the whole frame; pandas elides the middle rows and columns in the
# rendered output. NOTE(review): consider data.head() to keep the output small.
data


Out[3]:
ID householdID communityID xrtype zf1 zf5 zf6 zf7 zf12 zf13 ... fn098_w2s3 fn098_w2s4 fn098_w2s5 fn098_w2s6 fn098_w2s7 fn098_w2s8 fn098_w2s9 fn098_w2s10 fn098_w2s11 versionID
0 010104101001 0101041010 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
1 010104101002 0101041010 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
2 010104102001 0101041020 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
3 010104102002 0101041020 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
4 010104103001 0101041030 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
5 010104103002 0101041030 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
6 010104104001 0101041040 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
7 010104104002 0101041040 0101041 2 REIW Fowllup survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
8 010104105001 0101041050 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... 3 XChildName[3] NaN NaN NaN NaN NaN NaN NaN NaN 20151118
9 010104105002 0101041050 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... 3 XChildName[3] NaN NaN NaN NaN NaN NaN NaN NaN 20151118
10 010104106001 0101041060 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN 1 Have History of Work NaN ... 3 XChildName[3] NaN NaN NaN NaN NaN NaN NaN NaN 20151118
11 010104107001 0101041070 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
12 010104107002 0101041070 0101041 2 REIW Fowllup survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
13 010104108001 0101041080 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
14 010104108002 0101041080 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
15 010104109001 0101041090 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
16 010104109002 0101041090 0101041 2 REIW Fowllup survey 3 Both Agricultural and Non-Agricultural Work NaN 1 Self-employed Work NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
17 010104110001 0101041100 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18 010104110002 0101041100 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
19 010104111001 0101041110 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
20 010104112001 0101041120 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
21 010104112002 0101041120 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
22 010104113001 0101041130 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
23 010104113002 0101041130 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
24 010104114001 0101041140 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
25 010104114002 0101041140 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
26 010104115001 0101041150 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
27 010104115002 0101041150 0101041 2 REIW Fowllup survey 4 Not Work NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
28 010104116001 0101041160 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
29 010104116002 0101041160 0101041 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18355 347763216001 3477632160 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18356 347763216002 3477632160 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18357 347763217001 3477632170 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... 3 XChildName[3] 4 XChildName[4] 5 XChildName[5] NaN NaN NaN NaN NaN NaN 20151118
18358 347763217002 3477632170 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... 3 XChildName[3] NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18359 347763218001 3477632180 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18360 347763218002 3477632180 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18361 347763219001 3477632190 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18362 347763219002 3477632190 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18363 347763220001 3477632200 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18364 347763220002 3477632200 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18365 347763221001 3477632210 3477632 2 REIW Fowllup survey 2 Only Non-agricultural Work 1 Employed Work NaN NaN NaN 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18366 347763222001 3477632220 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18367 347763222002 3477632220 3477632 2 REIW Fowllup survey 1 Only Agricultural Work NaN NaN NaN NaN 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18368 347763223001 3477632230 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18369 347763223002 3477632230 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18370 347763224001 3477632240 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... 3 XChildName[3] 4 XChildName[4] NaN NaN NaN NaN NaN NaN NaN 20151118
18371 347763224002 3477632240 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18372 347763225001 3477632250 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... 3 XChildName[3] NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18373 347763226001 3477632260 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18374 347763226002 3477632260 3477632 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18375 347763301001 3477633010 3477633 2 REIW Fowllup survey 4 Not Work NaN NaN NaN 1 Have History of Work NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18376 347763302001 3477633020 3477633 2 REIW Fowllup survey 2 Only Non-agricultural Work NaN 1 Self-employed Work NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18377 347763302002 3477633020 3477633 2 REIW Fowllup survey 2 Only Non-agricultural Work NaN 1 Self-employed Work NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18378 347763306001 3477633060 3477633 2 REIW Fowllup survey 4 Not Work NaN NaN NaN 1 Have History of Work NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18379 347763306002 3477633060 3477633 2 REIW Fowllup survey 4 Not Work NaN NaN NaN 1 Have History of Work NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18380 347763310001 3477633100 3477633 2 REIW Fowllup survey 4 Not Work NaN NaN NaN 1 Have History of Work NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18381 347763311001 3477633110 3477633 2 REIW Fowllup survey 2 Only Non-agricultural Work NaN 1 Self-employed Work NaN 1 Have History of Work 1 Working ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18382 347763311002 3477633110 3477633 2 REIW Fowllup survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18383 347763313002 3477633130 3477633 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118
18384 347763314002 3477633140 3477633 1 NEWIW New survey NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 20151118

18385 rows × 717 columns


In [33]:
# Positional column slices of `data`: columns 95..373, and the first five
# columns (listed by the earlier `data.columns[0:5]` cell as ID, householdID,
# communityID, xrtype, zf1).
# NOTE(review): "health" comes from the variable name only — confirm that
# positions 95..373 are the intended columns.
health_data = data.iloc[:,95:374]
id_data = data.iloc[:,0:5]

In [45]:
# Recombine the two column slices side by side, matching rows on their index.
whole_data = pd.merge(id_data, health_data, left_index=True, right_index=True)

In [4]:
# Export `data` to a single-sheet workbook without the index column.
# The context manager saves and closes the workbook on exit; `writer.save()`
# and the `encoding` argument of `to_excel` were removed in pandas 2.0.
# NOTE(review): this writes `data`, not the `whole_data` frame built in the
# merge cell — confirm which one is intended.
with pd.ExcelWriter("work_data_charls.xlsx") as writer:
    data.to_excel(writer, sheet_name='sheet1', index=False)

In [4]:
import os
from os import path

# Build the path <current working directory>/output/result.xlsx and display it.
fpath = os.getcwd()
os.path.join(fpath, "output", "result.xlsx")


Out[4]:
'/home/da/nlp/output/result.xlsx'

In [25]:
import re
import numpy as np
import pymysql
import pandas as pd

def re_sub(text_l):
    """Strip links, mentions, tags and HTML entities from text and collapse whitespace.

    Non-string input (``None``, numbers, NaN, ...) is converted with ``str()``
    first. The substitutions run in this order:

    1. every whitespace run becomes a single space, then every space becomes
       a comma (whitespace ends up as ``,`` in the result);
    2. ``#...#``, ``[...]`` and ``【...】`` spans are removed;
    3. ``http:``/``https:`` links are removed;
    4. ``@name`` mentions are removed, up to and including the next delimiter
       from the character class, or up to the end of the text;
    5. ``我在<up to 2 word chars>:<word chars>`` spans are removed;
    6. remaining ``[1-4 word chars]`` tags and ``&name;`` entities are removed.

    Returns the cleaned string.
    """
    # Coerce once up front; this replaces the old recursive call on str(text_l).
    text_s = text_l if isinstance(text_l, str) else str(text_l)
    # Raw strings keep the regex bytes identical while avoiding the invalid
    # escape sequences (`\s`, `\[`, `\ `) that the plain literals contained.
    text_s = re.sub(r'\s+', ' ', text_s)
    text_s = re.sub(' ', ',', text_s)
    text_s = re.sub(r'#.+?#|\[.+?]|【.+?】', '', text_s)
    text_s = re.sub(r'https?:[a-zA-Z\/\.0-9_]+', '', text_s)
    text_s = re.sub(r'@.+?[,,::\ )]|@.+?$', '', text_s)
    text_s = re.sub(r'我在(\w){0,2}[::](\w*)', '', text_s)
    text_s = re.sub(r'\[(\w){1,4}\]', '', text_s)
    text_s = re.sub(r'&[a-z]+;', '', text_s)
    return text_s

# Element-wise version for arrays / Series. `otypes` is given explicitly so an
# empty input returns an empty array instead of raising (np.vectorize cannot
# infer the output type from zero elements).
re_sub_vec = np.vectorize(re_sub, otypes=[str])

In [26]:
def get_db_data(query_str):
    """Run ``query_str`` against the MySQL database and return the result as a DataFrame.

    Connection settings default to the values this notebook has always used and
    can be overridden with the environment variables ``DP_DB_HOST``,
    ``DP_DB_PORT``, ``DP_DB_USER``, ``DP_DB_PASSWORD`` and ``DP_DB_NAME``.

    The connection is closed even when the query raises.
    """
    import os  # local import: only needed for the optional overrides below

    # NOTE(security): the fallback password is still embedded in the source for
    # backward compatibility. Move it to the environment / a secrets store and
    # rotate it; it should not live in a shared notebook.
    conn = pymysql.connect(
        host=os.environ.get('DP_DB_HOST', '127.0.0.1'),
        port=int(os.environ.get('DP_DB_PORT', 3306)),
        user=os.environ.get('DP_DB_USER', 'analyzer'),
        password=os.environ.get('DP_DB_PASSWORD', 'analyzer@tbs2016'),
        database=os.environ.get('DP_DB_NAME', 'dp_relation'),
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )
    try:
        # read_sql_query already returns a DataFrame; no extra cursor or
        # DataFrame() wrapping is needed.
        return pd.read_sql_query(query_str, conn)
    finally:
        conn.close()

In [27]:
# Fetch every row of `emotion_analyse` whose keyword_id is '36_1'.
# NOTE(review): this rebinds `data`, which an earlier cell used for the Stata frame.
data = get_db_data("select * from emotion_analyse where keyword_id='36_1'")

In [28]:
# Clean every entry of the `content` column with the vectorized re_sub and
# overwrite the column with the result (the raw text is not kept).
data['content'] = re_sub_vec(data['content'])

In [32]:
# Spot-check one cleaned entry (the one at index label 2).
print(data['content'][2])


“暖暖,,月经已经推迟了10天,而且最近胃胀气,老是想打嗝,应该怎么办啊?对了,我一直火气挺大,随便吃点什么就上火,上个月吃了很多下火的药啊,猕猴桃,火龙果,还经常喝菊花茶。。会不会都有影响?”,你这次月经推迟和最近的胃胀气,跟吃下火药、这些水果和菊花茶有关。春天上火的时候,别着急下,​

In [22]:
# Manual check of re_sub on one sample post containing bracketed emoticon tags.
re_sub("最近胃里反酸水,烧心,吃啥都没味道[失望][失望]还要加班写比赛材料,新校长对青年教师的要求那不是一般的高[摊手][摊手]真的好多事要做,一天48小时都不一定够用[抓狂][抓狂] ​")


Out[22]:
'最近胃里反酸水,烧心,吃啥都没味道还要加班写比赛材料,新校长对青年教师的要求那不是一般的高真的好多事要做,一天48小时都不一定够用,\u200b'

In [34]:
import time

# Format the current local time; asctime() with no argument uses
# localtime() of the current time.
localtime = time.asctime()
print("本地时间为 :", localtime)


本地时间为 : Mon Apr 17 17:07:46 2017