In [2]:
#!/usr/bin/env python
# coding=utf-8
%matplotlib inline
import MySQLdb
import jieba
import numpy as np
import os,sys
import json
import matplotlib
from collections import Counter
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

Load all chat records from the export and drop the lines that are not messages (such as web-page content).


In [3]:
# Read the raw chat export; the handle is only needed for the read itself.
with open('./Flipped.txt', 'r') as f:
    all_files = f.readlines()

# Keep only the lines that split into more than three whitespace-separated fields.
datalist = [line for line in all_files if len(line.split()) > 3]

# The first two retained lines are discarded
# (presumably header rows of the export -- TODO confirm against the file).
all_data = datalist[2:]

Split each record into its timestamp, sender, and message fields.


In [4]:
# Column order of every row stored in ``all_list``.  Other cells index these rows
# by position (row[0] is the sender name, row[-3] the message type, row[-2] the
# day), so the order is pinned explicitly instead of relying on dict iteration.
FIELD_ORDER = ('name', 'hour', 'receive', 'text', 'month', 'year',
               'time_stamp', 'type', 'day', 'minute')

final_dict = {}   # index into all_data -> dict of parsed fields
all_list = []     # the same records as flat rows, ordered by FIELD_ORDER
for i, item in enumerate(all_data):
    # Fixed-width timestamp column, formatted as "MM/DD/YYYY HH:MM:SS".
    s = item[30:49]
    try:
        parsed_time = datetime.datetime.strptime(s, "%m/%d/%Y %H:%M:%S")
    except ValueError:
        # Malformed line: report it and skip it, rather than continuing with a
        # stale (or still undefined) timestamp from a previous iteration.
        print(item)
        continue

    temp_dict = {}
    # Kept as a full-precision float: float32 cannot represent epoch seconds
    # exactly and would round them to multiples of roughly two minutes.
    temp_dict['time_stamp'] = time.mktime(parsed_time.timetuple())
    temp_dict['month'] = parsed_time.month
    temp_dict['day'] = parsed_time.day
    temp_dict['year'] = parsed_time.year
    temp_dict['hour'] = parsed_time.hour
    temp_dict['minute'] = parsed_time.minute
    # Remaining fixed-width columns; all spaces (including padding) are removed.
    temp_dict['name'] = item[55:80].replace(" ","")
    temp_dict['receive'] = item[80:100].replace(" ","")
    temp_dict['type'] = item[100:120].replace(" ","")
    temp_dict['text'] = item[120:].replace(" ","")

    all_list.append([temp_dict[key] for key in FIELD_ORDER])
    final_dict[i] = temp_dict

# Show the first parsed record in both representations as a sanity check.
if all_list:
    print(final_dict[min(final_dict)])
    print(all_list[0])


{'name': 'Flipped', 'hour': 15, 'receive': 'Receive', 'text': "I'veacceptedyourfriendrequest.Nowlet'schat!\r\n", 'month': 6, 'year': 2016, 'time_stamp': 1.4654787e+09, 'type': 'Text', 'day': 9, 'minute': 23}
['Flipped', 15, 'Receive', "I'veacceptedyourfriendrequest.Nowlet'schat!\r\n", 6, 2016, 1.4654787e+09, 'Text', 9, 23]

In [4]:
# Dumping the parsed records to 'result.json' is currently disabled:
#with open('result.json', 'w') as fp:
    #json.dump(final_dict, fp)

# Report how many rows were parsed.
print(len(all_list))


28799

Helper functions for separating my messages from yours.


In [5]:
def get_frequency(all_list, idx_=-2):
    """Count how often each integer value occurs in one column of ``all_list``.

    Args:
        all_list: sequence of equal-length rows.
        idx_: index of the column to count (default ``-2``).

    Returns:
        The ``np.bincount`` result for that column: element ``v`` is the number
        of rows whose column value equals ``v``.  An empty ``all_list`` yields
        an empty array.
    """
    if len(all_list) == 0:
        # np.array([]) is 1-D, so the 2-D column slice below would raise.
        return np.bincount(np.array([], dtype=int))
    # The builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    all_item = np.array(all_list)[:, idx_].astype(int)
    return np.bincount(all_item)

def get_related_list(all_list, idx_=-2):
    """Return one column of ``all_list`` as an integer array.

    Args:
        all_list: sequence of equal-length rows.
        idx_: index of the column to extract (default ``-2``).

    Returns:
        1-D integer NumPy array with one entry per row; empty when
        ``all_list`` is empty.
    """
    if len(all_list) == 0:
        # np.array([]) is 1-D, so the 2-D column slice below would raise.
        return np.array([], dtype=int)
    # The builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    return np.array(all_list)[:, idx_].astype(int)

def get_my_type(all_list, people=0, idx_= -3):
    """Count the values of one column separately for each side of the chat.

    Rows whose first element is ``'Flipped'`` are attributed to "you"; every
    other row is attributed to "me".

    Args:
        all_list: sequence of equal-length rows; ``row[0]`` is the sender name.
        people: accepted for compatibility; not used by the function.
        idx_: index of the column to count (default ``-3``).

    Returns:
        ``(my_counts, your_counts)`` -- two ``Counter`` objects keyed by the
        column values.  A side without any rows yields an empty ``Counter``.
    """
    me_ = []
    you_ = []
    for item in all_list:
        if item[0] == 'Flipped':
            you_.append(item)
        else:
            me_.append(item)

    def _count_column(rows):
        # An empty list becomes a 1-D array, on which the 2-D column slice
        # would raise IndexError, so handle that case up front.
        if not rows:
            return Counter()
        # .tolist() turns NumPy scalars back into plain Python values.
        return Counter(np.array(rows)[:, idx_].tolist())

    return _count_column(me_), _count_column(you_)

In [6]:
def day_freq_cal(records=None):
    """Collect the day-of-month of every message, split by sender.

    Args:
        records: mapping of record key -> dict with at least ``'name'`` and
            ``'day'`` entries.  Defaults to the notebook-level ``final_dict``.

    Returns:
        ``[days_from_flipped, days_from_others]`` as two float arrays.
    """
    if records is None:
        records = final_dict
    day_ = [[],[]]
    for item in records:
        temp_dict = records[item]
        if temp_dict['name'] == 'Flipped':
            day_[0].append(temp_dict['day'])
        else:
            day_[1].append(temp_dict['day'])
    # The builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide.
    return [np.array(day_[0]).astype(float), np.array(day_[1]).astype(float)]

def hour_freq_cal(records=None):
    """Collect the hour-of-day of every message, split by sender.

    Args:
        records: mapping of record key -> dict with at least ``'name'`` and
            ``'hour'`` entries.  Defaults to the notebook-level ``final_dict``.

    Returns:
        ``[hours_from_flipped, hours_from_others]`` as two float arrays.
    """
    if records is None:
        records = final_dict
    hour_ = [[],[]]
    for item in records:
        temp_dict = records[item]
        if temp_dict['name'] == 'Flipped':
            hour_[0].append(temp_dict['hour'])
        else:
            hour_[1].append(temp_dict['hour'])
    # The builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide.
    return [np.array(hour_[0]).astype(float), np.array(hour_[1]).astype(float)]

In [7]:
# Global figure styling shared by every plot below.

# Font type 42 makes the PDF/PS backends embed TrueType fonts.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
plt.rc('font', family='Times New Roman')

style_overrides = {'legend.frameon': True,
                   'font.family': "Times New Roman",
                   'axes.linewidth': 2}
sns.set_style("darkgrid", style_overrides)

context_overrides = {"lines.linewidth": 2,
                     "font.size": 14,
                     "axes.labelsize": 14,
                     "axes.titlesize": 14,
                     "xtick.labelsize": 14,
                     "ytick.labelsize": 14}
sns.set_context("paper", rc=context_overrides)

# The former trailing sns.despine() call was removed: with no figure open it
# only acted on a freshly created empty figure (shown as a bare Figure repr)
# and had no effect on the plots drawn in later cells.


<matplotlib.figure.Figure at 0x7f2349e58910>

In [8]:
# Stacked histogram of the number of messages per day of the month.
# day_freq_cal() returns [days of messages from 'Flipped', days of all others],
# which is the order the two legend labels follow.
day_count = day_freq_cal()

# A single figure/axes pair; the earlier extra plt.figure('month') call only
# produced an additional empty figure.
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, 31, 10))
ax.set_yticks(np.arange(0, 1600, 200))
ax.set_xlabel(r"Day")
ax.set_ylabel(r"Message Count")
labels = [u'From you to me', u'From me to you']
ax.hist(day_count, bins=33, range=[0, 33], stacked=True, fill=True,
        align='left', label=labels)
ax.legend(loc='upper left', shadow=True, fontsize='small')
ax.set_ylim(0, 1600)
fig.savefig('day_frequncy.jpg', bbox_inches='tight', dpi=300)


/usr/lib/python2.7/dist-packages/matplotlib/font_manager.py:1288: UserWarning: findfont: Font family [u'Times New Roman'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))
<matplotlib.figure.Figure at 0x7f2348115a10>

In [9]:
# Stacked histogram of the number of messages per hour of the day.
# hour_freq_cal() returns [hours of messages from 'Flipped', hours of all
# others], which is the order the two legend labels follow.
hour_count = hour_freq_cal()

# A single figure/axes pair; the earlier extra plt.figure('month') call only
# produced an additional (and mislabelled) empty figure.
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, 25, 6))
ax.set_xlabel(r"Hour")
ax.set_ylabel(r"Message Count")
labels = [u'From you to me', u'From me to you']
ax.hist(hour_count, bins=24, range=[0, 24], stacked=True, fill=True,
        align='left', label=labels)
ax.legend(loc='upper left', shadow=True, fontsize='small')
fig.savefig('hour_frequncy.jpg', bbox_inches='tight', dpi=300)


<matplotlib.figure.Figure at 0x7f2349e58f10>

In [10]:
# Pie chart of the non-text message types on the "me" side of the chat.
colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]

# get_my_type returns (my counts, your counts); only t_1 is charted here.
t_1, t_2 = get_my_type(all_list)

# Labels and counts of every type except "Text", built pairwise so they stay
# aligned (removing "the maximum value" and the "Text" key independently only
# lines up when "Text" happens to be the most frequent type).
t_1_k = [kind for kind in t_1 if kind != "Text"]
t_1_v = [t_1[kind] for kind in t_1_k]
print('%s %s' % (t_1_v, t_1_k))

# Types that get their own slice.
text_list = ['Emoticon',  'Photos', 'Webpages', 'Videocall']

# Total of the remaining minor types, computed instead of hard-coded.
# It is stored on the counter but is not part of the chart below.
t_1['others'] = sum(t_1[kind] for kind in t_1_k if kind not in text_list)

# Offset of each slice from the centre, one value per entry of text_list.
explode = (0.15, 0.13, 0.11, 0.08)

fig1, ax1 = plt.subplots()
ax1.pie([t_1[ind_] for ind_ in text_list],
        explode=explode,
        colors=[sns.xkcd_rgb[item] for item in colors],
        labels=text_list,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
fig1.savefig('pie_me.jpg', bbox_inches='tight', dpi=300)


[1699, 145, 23, 719, 1, 8, 195, 4] ['Emoticon', 'Webpages', 'Videos', 'Photos', 'Location', 'Sight', 'Videocall', 'Audio']

In [11]:
# Pie chart of the non-text message types on the "you" side of the chat.
colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]

# get_my_type returns (my counts, your counts); only t_2 is charted here.
t_1, t_2 = get_my_type(all_list)

# Labels and counts of every type except "Text", built pairwise so they stay
# aligned.  The t_1_* names are kept from the previous cell although they hold
# the t_2 ("you") data here.
t_1_k = [kind for kind in t_2 if kind != "Text"]
t_1_v = [t_2[kind] for kind in t_1_k]
print('%s %s' % (t_1_v, t_1_k))

# Types that get their own slice, and the offset of each slice from the centre.
text_list = ['Emoticon', 'Photos', 'Webpages', 'Videocall',  'Systeminfo',]
explode = (0.15, 0.14, 0.13, 0.12, 0.08)

fig1, ax1 = plt.subplots()
ax1.pie([t_2[ind_] for ind_ in text_list],
        colors=[sns.xkcd_rgb[item] for item in colors],
        explode=explode,
        labels=text_list,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

fig1.savefig('pie_you.jpg', bbox_inches='tight', dpi=300)


[1838, 7, 12, 4, 313, 1, 82, 103, 200, 1] ['Emoticon', '', 'Sight', 'Videos', 'Photos', 'Location', 'Webpages', 'Systeminfo', 'Videocall', 'Audio']

Time of day at which calls were made.


In [13]:
def voice_time(records=None):
    """Collect the hour-of-day of every ``'Videocall'`` record, split by sender.

    Args:
        records: mapping of record key -> dict with at least ``'type'``,
            ``'name'`` and ``'hour'`` entries.  Defaults to the notebook-level
            ``final_dict``.

    Returns:
        ``[hours_from_flipped, hours_from_others]`` as two float arrays.
    """
    if records is None:
        records = final_dict
    hour_ = [[],[]]
    for item in records:
        temp_dict = records[item]
        if temp_dict['type'] != 'Videocall':
            continue
        if temp_dict['name'] == 'Flipped':
            hour_[0].append(temp_dict['hour'])
        else:
            hour_[1].append(temp_dict['hour'])
    # The builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide.
    return [np.array(hour_[0]).astype(float), np.array(hour_[1]).astype(float)]

In [14]:
# Stacked histogram of the hour of day of 'Videocall' records.
# voice_time() returns [hours of records from 'Flipped', hours of all others],
# which is the order the two legend labels follow.
voice_return = voice_time()

# A single figure/axes pair; the earlier extra plt.figure('voice hour') call
# only produced an additional empty figure.
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, 25, 6))
ax.set_xlabel(r"Voice Time")
ax.set_ylabel(r"Message Count")
labels = [u'From you to me', u'From me to you']
ax.hist(voice_return, bins=24, range=[0, 24], stacked=True, fill=True,
        align='left', label=labels)
ax.legend(loc='upper left', shadow=True, fontsize='small')
fig.savefig('voice_hour_frequncy.jpg', bbox_inches='tight', dpi=300)


<matplotlib.figure.Figure at 0x7f23885dfa10>

In [142]:
import jieba

# Register chat-specific words with jieba's dictionary before segmenting.
for custom_word in ('孟老师', '磊哥哥', '二货', '童童', '抱一下',
                    '老司机', '大神', '代码', '智障', '笑点'):
    jieba.add_word(custom_word)

In [11]:
# Segment every 'Text' message with jieba and collect, per side of the chat,
# the list of tokens (*_text) and the concatenated raw messages (*_whole_text).

# Characters removed before segmentation; digits are removed as well.
# The backslash is written as an explicit escape ("\\") so the literal is valid.
strip_chars = set(u",.@:#&【】;(),。、:!/\\!_[],.:~")

you_text = []
my_text = []
you_messages = []
my_messages = []
for item_ in final_dict:
    temp_dict_ = final_dict[item_]
    if temp_dict_['type'] != 'Text':
        continue

    # Decode once, and only when the stored text is a byte string.
    raw_text = temp_dict_['text']
    temp_text = raw_text.decode('utf8') if isinstance(raw_text, bytes) else raw_text

    cut_text = u"".join(ch for ch in temp_text
                        if not (ch.isdigit() or ch in strip_chars))
    # Tokens are collected directly; the previous join-on-space/split-on-space
    # round trip is not needed because the text has had its spaces removed.
    tokens = list(jieba.cut(cut_text, cut_all=False))

    if temp_dict_['name'] == 'Flipped':
        you_text += tokens
        you_messages.append(temp_text)
    else:
        my_text += tokens
        my_messages.append(temp_text)

# Join once at the end instead of growing a string inside the loop.
you_whole_text = u"".join(you_messages)
my_whole_text = u"".join(my_messages)
import operator


def print_top_words(tokens, limit=399):
    """Print the most frequent multi-character tokens with their counts.

    The second most frequent token is printed first on its own line, followed
    by up to ``limit`` of the most frequent tokens in descending order of
    count; tokens of length one are skipped.

    Args:
        tokens: iterable of tokens to count.
        limit: how many of the top-ranked tokens to inspect (default 399).
    """
    # Ascending by count; the most frequent tokens are at the end.
    sorted_x = sorted(Counter(tokens).items(), key=operator.itemgetter(1))
    if len(sorted_x) >= 2:
        print(sorted_x[-2][0])
    # Walk backwards from the end to visit the top ``limit`` entries.
    for word, count in sorted_x[:-(limit + 1):-1]:
        if len(word) > 1:
            print('%s %s' % (word, count))


# Same report for both sides of the chat.
print_top_words(my_text)
print_top_words(you_text)

In [12]:
import jieba.analyse

# Register chat-specific words with jieba's dictionary before extracting keywords.
for custom_word in ('孟老师', '磊哥哥', '二货', '童童', '抱一下', '老司机',
                    '大神', '代码', '智障', '笑点', '小二货'):
    jieba.add_word(custom_word)

# Stop-word filtering via jieba.analyse.set_stop_words(...) is currently disabled.

# Top 100 keywords of the "you" side together with their weights.
tags = jieba.analyse.extract_tags(you_whole_text, topK=100, withWeight=True)
for keyword, weight in tags:
    # Weights are scaled by 10000 and truncated to integers for display.
    print('%s\t%d' % (keyword, int(weight * 10000)))


磊哥哥	10347
晚安	1569
老司机	793
我要	553
宝宝	538
哈哈	512
安安	405
Chuckle	380
什么	378
今天	378
抱抱	375
童童	373
青海	358
知道	347
明天	345
哈哈哈	304
没有	299
怎么	289
洗澡	255
梦到	252
不是	246
好像	225
可以	221
这么	209
这个	208
早安	208
喜欢	199
为什么	195
真的	189
睡觉	187
感觉	187
不想	175
特别	168
记得	167
师兄	165
就是	164
不要	164
开心	163
好帅	156
是不是	153
一下	153
好看	152
觉得	149
反正	146
那个	146
时候	146
厉害	145
好吃	143
因为	143
刘老师	143
开车	142
果然	141
不行	141
一会	137
还有	137
不会	134
昨天	133
早上好	131
哪里	131
你好	130
打字	129
why	128
看到	125
应该	124
姐姐	124
可能	120
吃饭	120
大神	118
原来	117
孟老师	117
bossgirl	117
一个	117
睡着	114
实验室	112
我困	111
脱兔	111
嘻嘻	109
明明	108
刚才	107
宿舍	105
这样	105
老板	104
好慢	104
好好看	102
一定	102
自己	102
撩妹	100
可是	99
这是	98
不好	98
但是	97
春儿	97
洗手间	96
看看	96
噗嗤	95
no	95
好好	94
师姐	93
爬山	91
所以	91