In [1]:
import pymongo, json, matplotlib
# Input files.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
PAZAN_PUBLS_PATH = '/home/oleg/coding/go-to-hack-main/share/pazan_publs.json'
USERS_PATH = '/home/oleg/coding/go-to-hack-main/share/source_data/users.json'

# MongoDB connection; later cells read the 'vk' database through it.
client2 = pymongo.MongoClient('goto.reproducible.work')

# JSON object keyed by stringified user id; each value is stored below under
# the matching user's 'groups' key.
with open(PAZAN_PUBLS_PATH) as publs_file:
    pazans = json.load(publs_file)

# Stringified ids present in the map, used for membership checks.
st = set(pazans)

# users.json is read as one JSON document per line. Only users whose
# stringified '_id' appears in `pazans` are kept, keyed by their raw '_id'.
users = {}
with open(USERS_PATH) as users_file:
    for line in users_file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        user = json.loads(line)
        user_key = str(user['_id'])
        if user_key in st:
            user['groups'] = pazans[user_key]
            users[user['_id']] = user
In [7]:
import pandas
import matplotlib.pyplot as plt
%matplotlib inline
# Font settings applied to every figure created after this cell runs.
font_options = {'family': 'DejaVu sans', 'size': '50'}
matplotlib.rc('font', **font_options)
In [3]:
# Tabular view of the loaded users: one row per key of `users`.
data = pandas.DataFrame.from_dict(users, orient='index')

# (user_id, profile) pairs; the analysis cells below iterate over this list.
lst = [pair for pair in users.items()]

# Pairs whose profile has more than one entry under 'groups'.
good_users = [pair for pair in lst if len(pair[1]['groups']) > 1]

# Report the size of both collections, one number per line.
for size in (len(users), len(good_users)):
    print(size)
In [8]:
# Histogram of the 'sex' column over all loaded users.
plt.figure(figsize=(50.0, 20.0))
# Two bins over [1, 2]: value 1 falls into the first bin, value 2 into the
# second; anything outside that range is ignored by hist().
plt.hist(data.sex, bins=2, range=(1, 2))
plt.xlabel("Пол")
plt.ylabel("Количество пацанов")
# Save only after the labels are set, otherwise the PNG is written without them.
plt.savefig('gender.png')
plt.show()
Первая гипотеза опровергнута
In [69]:
import matplotlib.gridspec as gridspec  # not used in this cell; kept so the name stays defined

# Denominator that turns counts into shares.
# NOTE(review): presumably the number of loaded users at the time of the
# original run -- confirm, or replace with len(lst).
TOTAL_USERS = 36275

# Group counts shown on the horizontal axis.
x = range(1, 10)

# Single pass over the users: tally (number of groups, sex) pairs.
# .get() is used so a profile without a 'sex' field is simply not counted
# in either series instead of raising KeyError.
group_sex_counts = {}
for _, profile in lst:
    key = (len(profile['groups']), profile.get('sex'))
    group_sex_counts[key] = group_sex_counts.get(key, 0) + 1

# Share of users with exactly i groups, split by the value of 'sex'.
y1 = [group_sex_counts.get((i, 1), 0) / TOTAL_USERS for i in x]  # sex == 1, plotted as 'девочки'
y2 = [group_sex_counts.get((i, 2), 0) / TOTAL_USERS for i in x]  # sex == 2, plotted as 'мальчики'

f, ax = plt.subplots(figsize=(50, 20))
# Stack order (y2 first, then y1) matches the order of the legend entries.
ax.stackplot(x, y2, y1, colors=('#add8e6', 'pink'))
ax.set_xlabel("Чёткость (precision)")
# NOTE(review): the plotted values are shares (count / TOTAL_USERS), while the
# label reads "number of people" -- confirm which one is intended.
ax.set_ylabel("Количество людей")
ax.legend(['мальчики', 'девочки'])
f.savefig('groups.png')
plt.show()
In [45]:
# Answer values shown on the horizontal axis.
x = range(1, 6)


def _count_personal_answers(field, value):
    """Count users in `lst` whose profile['personal'][field] equals `value`.

    Profiles without a 'personal' entry, or whose 'personal' entry lacks
    `field`, are not counted.
    """
    return sum(
        1
        for _, profile in lst
        if 'personal' in profile
        and field in profile['personal']
        and profile['personal'][field] == value
    )


y1 = [_count_personal_answers('alcohol', i) for i in x]
y2 = [_count_personal_answers('smoking', i) for i in x]

# Create the sized figure BEFORE plotting. Creating it afterwards leaves the
# curves on a default-size figure and shows an additional empty one.
plt.figure(figsize=(20.0, 20.0))
plt.plot(x, y1, label='alcohol')
plt.plot(x, y2, label='smoking')
plt.legend()
plt.show()
In [138]:
# Substrings searched for in each user's profile and wall posts; the value is
# the running match count.
# Previously tried and disabled: 'падик', 'найк', 'adidas', 'двор', 'искусство', 'бухл'.
keywords = {
    'адик': 0,
    'nike': 0,
    'брат': 0,
    'пацан': 0,
    'мама': 0,
    'учеба': 0,
    'вообще': 0,
}

# Sample: the first 1000 (user_id, profile) pairs.
puper_list = lst[0:1000]
# Number of users processed; the plotting cell that follows divides by it.
cnt = 0

# NOTE(review): Database.collection_names() is deprecated in newer PyMongo
# releases in favour of list_collection_names() -- confirm the installed version.
print(client2['vk'].collection_names())
walls = client2['vk']['walls']
print()

for u in puper_list:
    # Lower-cased repr of the whole (user_id, profile) pair.
    profile_text = str(u).lower()

    wall_doc = walls.find_one({'_id': u[1]['_id']})
    # find_one() returns None when nothing matches, and a stored document may
    # lack 'response'; in both cases treat the user as having no wall posts
    # instead of failing further down.
    if wall_doc is not None and 'response' in wall_doc:
        wall_posts = wall_doc['response']['items']
    else:
        wall_posts = []

    # Progress output every 100 users.
    if cnt % 100 == 0:
        print(cnt)
    cnt += 1

    for w in keywords:
        if w in profile_text:
            keywords[w] += 1
        else:
            # NOTE(review): every matching post adds one, so a single user can
            # contribute more than once to a keyword -- confirm this is intended.
            for wall_post in wall_posts:
                if w in wall_post['text'].lower():
                    keywords[w] += 1

print(keywords)
In [139]:
# Bar chart of keyword matches per processed user, in ascending order.
plt.figure(figsize=(50.0, 20.0))

# (keyword, count) pairs ordered by count.
items = sorted(keywords.items(), key=lambda a: a[1])
keys = [word for word, _ in items]

# `cnt` is the number of users processed by the keyword-counting cell.
# NOTE(review): a later cell reuses the name `cnt`, so run this cell right
# after the counting cell.
# Guard against cnt == 0 (empty sample) instead of raising ZeroDivisionError.
values = [count / cnt if cnt else 0.0 for _, count in items]

positions = range(len(values))
plt.bar(positions, values, align='center')
plt.xticks(range(len(keys)), keys)
plt.savefig('words.png')
plt.show()
In [88]:
plt.figure(figsize=(50.0, 20.0))

# Group-count buckets 0..7; users with any other number of groups are skipped.
r_gr = range(0, 8)

# Per bucket: sum of answers (groups_*) and number of answers (cnt_*).
groups_alc = dict.fromkeys(r_gr, 0)
cnt_alc = dict.fromkeys(r_gr, 0)
groups_smo = dict.fromkeys(r_gr, 0)
cnt_smo = dict.fromkeys(r_gr, 0)

for _, profile in lst:
    n_groups = len(profile['groups'])
    if n_groups not in r_gr or 'personal' not in profile:
        continue
    personal = profile['personal']
    if 'alcohol' in personal:
        groups_alc[n_groups] += personal['alcohol']
        cnt_alc[n_groups] += 1
    if 'smoking' in personal:
        groups_smo[n_groups] += personal['smoking']
        cnt_smo[n_groups] += 1

# Turn sums into means; buckets without any answer keep their initial 0.
for sums, counts in ((groups_alc, cnt_alc), (groups_smo, cnt_smo)):
    for bucket in r_gr:
        if counts[bucket] != 0:
            sums[bucket] /= counts[bucket]

# Bucket 0 is overwritten with a fixed value for both series.
# NOTE(review): hard-coded 1.8; its origin is not visible here -- confirm.
groups_alc[0] = groups_smo[0] = 1.8
print(groups_alc)

for series, series_label in ((groups_alc, 'Бухлишко'), (groups_smo, 'Сиги')):
    plt.plot(r_gr, list(series.values()), linewidth=13, label=series_label)

plt.xlabel("Группы")
plt.ylabel("Матожидание отношения")
plt.legend()
plt.savefig('expect.png')
plt.show()
In [334]:
# Handle to the 'users' collection of the 'vk' database.
usCol = client2['vk']['users']
In [356]:
# Ids of the users in good_users (each entry is a (user_id, profile) pair).
keys = {user_id for user_id, _ in good_users}
print("Keys")

# group -> number of selected users listing it.
group_counts = {}
# Lines processed so far, used only for progress output.
cnt = 0

# Each line is parsed as JSON and read as [person_id, [group, ...]].
# NOTE(review): presumably gr[0] has the same type as the ids in good_users;
# if one side is str and the other int nothing will match -- verify.
with open('/home/oleg/coding/go-to-hack-main/share/resulting_groups.json') as groups_file:
    for line in groups_file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        gr = json.loads(line)
        per_id = gr[0]
        if per_id in keys:
            for g in gr[1]:
                group_counts[g] = group_counts.get(g, 0) + 1
        cnt += 1
        if cnt % 1000 == 0:
            print("Done: %s: " % cnt)

# (group, count) pairs, most frequent first; ties keep first-seen order.
super_groups = sorted(group_counts.items(), key=lambda a: -a[1])
print(super_groups[0:30])
In [ ]: