In [2]:
#!/usr/bin/env python
# coding=utf-8
%matplotlib inline
import MySQLdb
import jieba
import numpy as np
import os,sys
import json
import matplotlib
from collections import Counter
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
# Switch on seaborn's default plot theme for the figures created below.
sns.set()
In [3]:
# Read the exported chat log into memory, one string per line.
with open('./Flipped.txt','r') as f:
    all_files = f.readlines()

# Keep only lines that split into more than three whitespace-separated
# fields; shorter lines are discarded.
datalist = [line for line in all_files if len(line.split()) > 3]

# Drop the first two surviving lines (presumably header rows -- TODO confirm).
all_data = datalist[2:]
In [4]:
# Parse every fixed-width log line into a record.
#   final_dict: line index -> dict of named fields
#   all_list:   the same records as plain value lists (one list per record,
#               in the dict's own value order)
final_dict = {}
all_list = []
for i, item in enumerate(all_data):
    # Columns 30-48 carry the send time as "MM/DD/YYYY HH:MM:SS".
    try:
        sent_at = datetime.datetime.strptime(item[30:49], "%m/%d/%Y %H:%M:%S")
    except ValueError:
        # Malformed line: report it and skip it, instead of carrying on with
        # an undefined or stale timestamp from a previous iteration.
        print(item)
        continue

    temp_dict = {}
    # float64 on purpose: float32 has a 24-bit mantissa, so present-day epoch
    # seconds (~1.5e9) would be rounded to multiples of 128 seconds.
    temp_dict['time_stamp'] = np.float64(time.mktime(sent_at.timetuple()))
    # Calendar fields come straight from the parsed datetime (plain ints);
    # this also avoids the np.int alias that NumPy 1.24 removed.
    temp_dict['month'] = sent_at.month
    temp_dict['day'] = sent_at.day
    temp_dict['year'] = sent_at.year
    temp_dict['hour'] = sent_at.hour
    temp_dict['minute'] = sent_at.minute
    # Remaining fixed-width columns, with all spaces stripped out.
    temp_dict['name'] = item[55:80].replace(" ","")
    temp_dict['receive'] = item[80:100].replace(" ","")
    temp_dict['type'] = item[100:120].replace(" ","")
    temp_dict['text'] = item[120:].replace(" ","")

    # values() yields the same order that iterating the dict's items would.
    temp_list = list(temp_dict.values())
    all_list.append(temp_list)
    final_dict[i] = temp_dict

# Sanity check: show the first parsed record in both representations.
if all_list:
    print(final_dict[min(final_dict)])
    print(all_list[0])
In [4]:
# Optional JSON export of the parsed records (left disabled):
#with open('result.json', 'w') as fp:
#    json.dump(final_dict, fp)

# Report how many records were parsed.
print(len(all_list))
In [5]:
def get_frequency(all_list, idx_=-2):
    """Count how often each non-negative integer occurs in one column.

    Args:
        all_list: sequence of equal-length rows (lists of values).
        idx_: index of the column to tally; defaults to the second-to-last.

    Returns:
        1-D integer array produced by ``np.bincount``: element ``k`` is the
        number of rows whose column value equals ``k``.
    """
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    column = np.array(all_list)[:, idx_].astype(int)
    return np.bincount(column)
def get_related_list(all_list, idx_=-2):
    """Extract one column of the record table as an integer array.

    Args:
        all_list: sequence of equal-length rows (lists of values).
        idx_: index of the column to extract; defaults to the second-to-last.

    Returns:
        1-D integer NumPy array holding that column's values, in row order.
    """
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    return np.array(all_list)[:, idx_].astype(int)
def get_my_type(all_list, people=0, idx_= -3):
    """Tally one column separately for 'Flipped' rows and for all other rows.

    Args:
        all_list: sequence of equal-length rows; a row belongs to 'Flipped'
            when its first element equals the string 'Flipped'.
        people: unused; kept so existing calls keep working.
        idx_: index of the column to tally; defaults to the third-to-last.

    Returns:
        Tuple ``(my_counts, your_counts)`` of ``Counter`` objects: the first
        covers rows not from 'Flipped', the second covers rows from 'Flipped'.
        A group without rows yields an empty Counter.
    """
    me_, you_ = [], []
    for item in all_list:
        (you_ if item[0] == 'Flipped' else me_).append(item)

    def _tally(rows):
        # np.array([]) is one-dimensional, so slicing a column out of an
        # empty group would raise IndexError; return an empty tally instead.
        if not rows:
            return Counter()
        return Counter(np.array(rows)[:, idx_])

    return _tally(me_), _tally(you_)
In [6]:
def day_freq_cal(records=None):
    """Collect the day-of-month of every message, split by sender.

    Args:
        records: mapping of record id -> record dict with 'name' and 'day'
            keys. Defaults to the notebook-level ``final_dict``.

    Returns:
        Two-element list of float arrays: days of messages whose name is
        'Flipped', then days of all other messages.
    """
    if records is None:
        records = final_dict
    day_ = [[], []]
    for temp_dict in records.values():
        bucket = 0 if temp_dict['name'] == 'Flipped' else 1
        day_[bucket].append(temp_dict['day'])
    # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
    return [np.array(values).astype(float) for values in day_]
def hour_freq_cal(records=None):
    """Collect the hour-of-day of every message, split by sender.

    Args:
        records: mapping of record id -> record dict with 'name' and 'hour'
            keys. Defaults to the notebook-level ``final_dict``.

    Returns:
        Two-element list of float arrays: hours of messages whose name is
        'Flipped', then hours of all other messages.
    """
    if records is None:
        records = final_dict
    day_ = [[], []]
    for temp_dict in records.values():
        bucket = 0 if temp_dict['name'] == 'Flipped' else 1
        day_[bucket].append(temp_dict['hour'])
    # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
    return [np.array(values).astype(float) for values in day_]
In [7]:
# Font type 42 makes the PDF and PostScript backends embed TrueType fonts.
for fonttype_key in ('pdf.fonttype', 'ps.fonttype'):
    matplotlib.rcParams[fonttype_key] = 42

# Use the same serif family for matplotlib and for the seaborn style.
plt.rc('font', family='Times New Roman')
style_overrides = {
    'legend.frameon': True,
    'font.family': "Times New Roman",
    'axes.linewidth': 2,
}
sns.set_style("darkgrid", style_overrides)

# "paper" context with one common text size and thicker lines.
text_size = 14
context_overrides = {
    "lines.linewidth": 2,
    "font.size": text_size,
    "axes.labelsize": text_size,
    "axes.titlesize": text_size,
    "xtick.labelsize": text_size,
    "ytick.labelsize": text_size,
}
sns.set_context("paper", rc=context_overrides)
sns.despine()
In [8]:
# Stacked histogram of message counts per day of the month, split by sender.
# day_freq_cal() returns the 'Flipped' days first, then everyone else's,
# which is the order the legend labels below follow.
day_count = day_freq_cal()

# plt.subplots() creates its own figure, so no separate plt.figure() call is
# made first (that would only leave an empty extra figure behind).
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, 31, 10))
ax.set_yticks(np.arange(0, 1600, 200))
ax.set_xlabel(r"Day")
ax.set_ylabel(r"Message Count")
labels = [u'From you to me', u'From me to you']
ax.hist(day_count, bins=33, range=[0, 33], stacked=True, fill=True,
        align='left', label=labels)
ax.legend(loc='upper left', shadow=True, fontsize='small')
ax.set_ylim(0, 1600)
fig.savefig('day_frequncy.jpg', bbox_inches='tight', dpi=300)
In [9]:
# Stacked histogram of message counts per hour of the day, split by sender.
# hour_freq_cal() returns the 'Flipped' hours first, then everyone else's,
# which is the order the legend labels below follow.
hour_count = hour_freq_cal()

# plt.subplots() creates its own figure, so no separate plt.figure() call is
# made first (that would only leave an empty extra figure behind).
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, 25, 6))
ax.set_xlabel(r"Hour")
ax.set_ylabel(r"Message Count")
labels = [u'From you to me', u'From me to you']
ax.hist(hour_count, bins=24, range=[0, 24], stacked=True, fill=True,
        align='left', label=labels)
ax.legend(loc='upper left', shadow=True, fontsize='small')
fig.savefig('hour_frequncy.jpg', bbox_inches='tight', dpi=300)
In [10]:
# Pie chart of the non-text message types in the first tally returned by
# get_my_type (rows whose sender is not 'Flipped').
colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]
t_1, t_2 = get_my_type(all_list)

# List every category except "Text" together with its count. Labels and
# counts are filtered as pairs so they always stay aligned, even when "Text"
# is not the single largest category.
non_text = [pair for pair in t_1.items() if pair[0] != "Text"]
t_1_k = [pair[0] for pair in non_text]
t_1_v = [pair[1] for pair in non_text]
print("%s %s" % (t_1_v, t_1_k))

# Categories shown as slices, each with its own offset from the centre.
text_list = ['Emoticon', 'Photos', 'Webpages', 'Videocall']
explode = (0.15, 0.13, 0.11, 0.08)

fig1, ax1 = plt.subplots()
ax1.pie([t_1[ind_] for ind_ in text_list], explode=explode,
        colors=[sns.xkcd_rgb[item] for item in colors],
        labels=text_list, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
fig1.savefig('pie_me.jpg', bbox_inches='tight', dpi=300)
In [11]:
# Pie chart of the non-text message types in the second tally returned by
# get_my_type (rows whose sender is 'Flipped').
colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]
t_1, t_2 = get_my_type(all_list)

# List every category except "Text" together with its count. Labels and
# counts are filtered as pairs so they always stay aligned, even when "Text"
# is not the single largest category.
# NOTE: despite their names, t_1_k / t_1_v hold the t_2 data in this cell.
non_text = [pair for pair in t_2.items() if pair[0] != "Text"]
t_1_k = [pair[0] for pair in non_text]
t_1_v = [pair[1] for pair in non_text]
print("%s %s" % (t_1_v, t_1_k))

# Categories shown as slices, each with its own offset from the centre.
explode = (0.15, 0.14, 0.13, 0.12, 0.08)
text_list = ['Emoticon', 'Photos', 'Webpages', 'Videocall', 'Systeminfo',]

fig1, ax1 = plt.subplots()
ax1.pie([t_2[ind_] for ind_ in text_list],
        colors=[sns.xkcd_rgb[item] for item in colors],
        explode=explode, labels=text_list, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
fig1.savefig('pie_you.jpg', bbox_inches='tight', dpi=300)
In [13]:
def voice_time(records=None):
    """Collect the hour-of-day of every 'Videocall' record, split by sender.

    Args:
        records: mapping of record id -> record dict with 'type', 'name' and
            'hour' keys. Defaults to the notebook-level ``final_dict``.

    Returns:
        Two-element list of float arrays: hours of 'Videocall' records whose
        name is 'Flipped', then hours of all other 'Videocall' records.
    """
    if records is None:
        records = final_dict
    day_ = [[], []]
    for temp_dict in records.values():
        if temp_dict['type'] != 'Videocall':
            continue
        bucket = 0 if temp_dict['name'] == 'Flipped' else 1
        day_[bucket].append(temp_dict['hour'])
    # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
    return [np.array(values).astype(float) for values in day_]
In [14]:
# Stacked histogram of 'Videocall' records per hour of the day, split by
# sender. voice_time() returns the 'Flipped' hours first, then everyone
# else's, which is the order the legend labels below follow.
voice_return = voice_time()

# plt.subplots() creates its own figure, so no separate plt.figure() call is
# made first (that would only leave an empty extra figure behind).
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, 25, 6))
ax.set_xlabel(r"Voice Time")
ax.set_ylabel(r"Message Count")
labels = [u'From you to me', u'From me to you']
ax.hist(voice_return, bins=24, range=[0, 24], stacked=True, fill=True,
        align='left', label=labels)
ax.legend(loc='upper left', shadow=True, fontsize='small')
fig.savefig('voice_hour_frequncy.jpg', bbox_inches='tight', dpi=300)
In [142]:
import jieba

# Register chat-specific vocabulary with jieba's dictionary, one word at a
# time, in the listed order.
for custom_word in ('孟老师', '磊哥哥', '二货', '童童', '抱一下',
                    '老司机', '大神', '代码', '智障', '笑点'):
    jieba.add_word(custom_word)
In [11]:
# Split the 'Text' messages by sender and build, for each side:
#   *_text        flat list of jieba tokens (digits/punctuation removed first)
#   *_whole_text  all raw message texts concatenated
you_text = []
my_text = []
you_chunks = []
my_chunks = []

# Characters removed before segmentation; digits are removed as well.
strip_chars = set(u",.@:#&【】;(),。、:!/\\!_[],.:~")

for temp_dict_ in final_dict.values():
    if temp_dict_['type'] != 'Text':
        continue

    # Decode once; only byte strings need decoding.
    temp_text = temp_dict_['text']
    if isinstance(temp_text, bytes):
        temp_text = temp_text.decode('utf8')

    cut_text = u"".join(ch for ch in temp_text
                        if not (ch.isdigit() or ch in strip_chars))
    # Take the tokens directly rather than joining and re-splitting on
    # spaces, which added a spurious '' token for every empty message.
    tokens = list(jieba.cut(cut_text, cut_all=False))

    if temp_dict_['name'] == 'Flipped':
        you_text.extend(tokens)
        you_chunks.append(temp_text)
    else:
        my_text.extend(tokens)
        my_chunks.append(temp_text)

# Join once at the end instead of growing a string inside the loop.
you_whole_text = u"".join(you_chunks)
my_whole_text = u"".join(my_chunks)
In [12]:
import jieba.analyse

# Register chat-specific vocabulary with jieba's dictionary, one word at a
# time, in the listed order.
for custom_word in ('孟老师', '磊哥哥', '二货', '童童', '抱一下', '老司机',
                    '大神', '代码', '智障', '笑点', '小二货'):
    jieba.add_word(custom_word)

# Optional stop-word list (left disabled):
#jieba.analyse.set_stop_words('/Users/liujinjian/Desktop/stop_words.txt')

# Request up to 100 keywords with their weights from the 'Flipped' text and
# print each as "<word><TAB><weight scaled by 10000, truncated to int>".
tags = jieba.analyse.extract_tags(you_whole_text, topK=100, withWeight=True)
for word, weight in tags:
    scaled_weight = int(weight * 10000)
    print(word + '\t' + str(scaled_weight))