In [1]:
import time
import pandas as pd
import numpy as np
import cPickle as pickle
import random
In [ ]:
start = time.time()
with open('data/c_pairs_anon_scored.feb') as f:
    content = f.read().splitlines()
end = time.time()
end - start
In [ ]:
# parse February c_ records: one 'id1|id2|path|count|numMeeting' line per pair/path
unique_paths = set()
pairs = []
paths = []
meetings = []
start = time.time()
for i in xrange(len(content)):
    if i % 10000000 == 0:
        print i, ': ', time.time()-start
    (id1, id2, path, count, numMeeting) = content[i].split('|')
    # canonical pair key: the two ids concatenated in sorted order
    pairs.append(''.join(sorted([id1, id2])))
    paths.append(path)
    meetings.append(int(numMeeting))
unique_paths = set(paths)
end = time.time()
end - start
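In [ ]:
# Alternative sketch: the same pipe-delimited parse done with pandas.read_csv.
# The column names below are assumptions inferred from the split('|') above;
# this is not the original pipeline, just a more vectorized equivalent.
c_raw = pd.read_csv('data/c_pairs_anon_scored.feb', sep='|', header=None,
                    names=['id1', 'id2', 'path', 'count', 'numMeeting'],
                    dtype={'id1': str, 'id2': str})
c_raw['pair'] = [''.join(sorted(t)) for t in zip(c_raw['id1'], c_raw['id2'])]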
In [ ]:
start = time.time()
# uint8 keeps this Series small; assumes every meeting count fits in 0-255
pdMeetings = pd.Series(meetings, index=pairs, dtype=np.uint8)
end = time.time()
end - start
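In [ ]:
# Sanity check (sketch): np.uint8 silently wraps above 255, so it is worth
# confirming that the raw counts actually fit in the dtype chosen above.
max(meetings)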
In [ ]:
start = time.time()
pdPaths = pd.Series(paths, index=pairs)
end = time.time()
end - start
In [ ]:
start = time.time()
feats = pd.get_dummies(pdPaths, sparse=True)
end = time.time()
print end - start
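In [ ]:
# Toy illustration (sketch): get_dummies turns each distinct path string into
# a 0/1 indicator column, indexed by pair. The values here are made up.
demo = pd.Series(['a>b', 'a>c', 'a>b'], index=['pair1', 'pair2', 'pair3'])
pd.get_dummies(demo)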
In [ ]:
start = time.time()
# join the one-hot path features with the meeting counts, then keep a 0.5% sample
combinedFeats = pd.concat([feats, pdMeetings], axis=1).sample(frac=0.005)
end = time.time()
print end - start
# combinedFeats.shape
# (393169, 170)
# %xdel combinedFeats
In [ ]:
# relabel: rename the appended meetings column to 'numOfMeetings'
cols = combinedFeats.columns.tolist()
cols[-1] = 'numOfMeetings'
combinedFeats.columns = cols
In [ ]:
# read February t_space data to create label vector
start = time.time()
with open('data/t_pairs_anon_verbs.feb.txt') as f:
    content = f.read().splitlines()
end = time.time()
end - start
In [ ]:
# t_ records carry an extra verb field: 'id1|id2|verb|path|count|numInteraction'
t_unique_paths = set()
t_pairs = []
t_paths = []
t_interactions = []
start = time.time()
for i in xrange(len(content)):
    (id1, id2, verb, path, count, numInteraction) = content[i].split('|')
    t_pairs.append(''.join(sorted([id1, id2])))
    t_paths.append(path)
    t_interactions.append(int(numInteraction))
t_unique_paths = set(t_paths)
end = time.time()
end - start
In [ ]:
# remove dups: keep the first path seen for each pair; a set makes the
# membership test O(1) instead of rescanning the ut_pairs list each time
seen = set()
ut_pairs = []
ut_paths = []
start = time.time()
for i in xrange(len(t_pairs)):
    if t_pairs[i] not in seen:
        seen.add(t_pairs[i])
        ut_pairs.append(t_pairs[i])
        ut_paths.append(t_paths[i])
end = time.time()
print end - start
In [ ]:
start = time.time()
pdTPaths = pd.Series(ut_paths, index=ut_pairs)
end = time.time()
end - start
In [ ]:
start = time.time()
t_feats = pd.get_dummies(pdTPaths, sparse=True)
end = time.time()
print end - start
In [ ]:
# assign label '1' to all t_ examples
t_labels = pd.Series(1, index=ut_pairs, dtype=np.uint8)
In [ ]:
# combine and relabel
start = time.time()
t_combinedFeats = pd.concat([t_feats, t_labels], axis=1)
end = time.time()
print end - start
# relabel: rename the appended labels column to 'label'
cols = t_combinedFeats.columns.tolist()
cols[-1] = 'label'
t_combinedFeats.columns = cols
In [ ]:
# combine c_ and t_ examples (c_ contains mostly negative, t_ contains positive)
start = time.time()
# densify the sparse frames before concatenating
combinedFeats = combinedFeats.to_dense()
t_combinedFeats = t_combinedFeats.to_dense()
training = pd.concat([combinedFeats, t_combinedFeats])
end = time.time()
end - start
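In [ ]:
# Sanity check (sketch): positive vs. negative counts in the combined frame.
# 'label' is still NaN on the c_ rows at this point, until the fillna below.
training['label'].value_counts(dropna=False)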
In [ ]:
# remove pairs that appear in both c_ and t_ (keep the first occurrence), fill na
training = training[~training.index.duplicated(keep='first')]
training = training.fillna(0)
training = training.to_dense()
In [ ]:
# recover num of meeting for positive training examples
start = time.time()
positiveTrainingEx = training.loc[training.label==1, :].index.intersection(pdMeetings.index)
for hashId in positiveTrainingEx:
    training.at[hashId, 'numOfMeetings'] = float(pdMeetings.at[hashId])
end = time.time()
end - start
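In [ ]:
# Vectorized alternative (sketch): assign all recovered counts in one .loc call.
# pdMeetings can hold several rows per pair (one per path), so duplicates are
# dropped first; the uniqMeetings name is hypothetical, not from the original.
uniqMeetings = pdMeetings[~pdMeetings.index.duplicated(keep='first')]
training.loc[positiveTrainingEx, 'numOfMeetings'] = \
    uniqMeetings.loc[positiveTrainingEx].astype(float).values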
In [ ]:
# trim negative examples a little more: drop each with probability 0.7
removes = []
start = time.time()
for hashId in training.index:
    if training.at[hashId, 'label'] == 0:
        if random.random() < 0.7:
            removes.append(hashId)
end = time.time()
end - start
training.drop(removes, axis=0, inplace=True)
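In [ ]:
# Equivalent vectorized sketch, run on a copy so it does not trim `training`
# again: keep every positive plus a ~30% sample of the negatives.
trimmed = pd.concat([training[training['label'] == 1],
                     training[training['label'] == 0].sample(frac=0.3)])
trimmed.shape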
In [ ]:
# export to file
start = time.time()
pickle.dump(training, open('trainFinal.p', 'wb'))
end = time.time()
end - start
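In [ ]:
# Sketch: binary pickle protocol 2 is usually much faster and smaller than the
# Python 2 default (protocol 0) used above; the filename here is hypothetical.
pickle.dump(training, open('trainFinal.p2', 'wb'), protocol=2)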
In [2]:
start = time.time()
training = pickle.load(open('trainFinal.p', 'rb'))
end = time.time()
end - start
In [3]:
# creating test set from c_ for Mar
start = time.time()
with open('data/c_uniq_pairs_anon_scored.mar') as f:
    content = f.read().splitlines()
end = time.time()
end - start
unique_paths = set()
pairs = []
paths = []
meetings = []
start = time.time()
for i in xrange(len(content)):
    if i % 10000000 == 0:
        print i, ': ', time.time()-start
    (id1, id2, path, count, numMeeting) = content[i].split('|')
    pairs.append(''.join(sorted([id1, id2])))
    paths.append(path)
    meetings.append(int(numMeeting))
unique_paths = set(paths)
end = time.time()
end - start
start = time.time()
pdMeetings = pd.Series(meetings, index=pairs, dtype=np.uint8)
end = time.time()
end - start
start = time.time()
pdPaths = pd.Series(paths, index=pairs)
end = time.time()
end - start
start = time.time()
feats = pd.get_dummies(pdPaths, sparse=True)
end = time.time()
print end - start
start = time.time()
combinedFeatsT = pd.concat([feats, pdMeetings], axis=1).sample(frac=0.001)
end = time.time()
print end - start
cols = combinedFeatsT.columns.tolist()
cols[-1] = 'numOfMeetings'
combinedFeatsT.columns = cols
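In [ ]:
# Refactoring sketch: the Feb and Mar c_ pipelines are identical apart from
# the input file and sample fraction, so they could share one helper. The
# function name and signature are hypothetical, not from the original.
def build_c_feats(filename, frac):
    with open(filename) as f:
        lines = f.read().splitlines()
    pairs, paths, meetings = [], [], []
    for line in lines:
        (id1, id2, path, count, numMeeting) = line.split('|')
        pairs.append(''.join(sorted([id1, id2])))
        paths.append(path)
        meetings.append(int(numMeeting))
    feats = pd.get_dummies(pd.Series(paths, index=pairs), sparse=True)
    counts = pd.Series(meetings, index=pairs, dtype=np.uint8)
    out = pd.concat([feats, counts], axis=1).sample(frac=frac)
    cols = out.columns.tolist()
    cols[-1] = 'numOfMeetings'
    out.columns = cols
    return out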
In [4]:
# getting t_ examples for Mar
start = time.time()
with open('data/t_pairs_anon.mar') as f:
    content = f.read().splitlines()
end = time.time()
end - start
# note: the Mar t_ file has no verb field: 'id1|id2|path|count|numInteraction'
t_unique_paths = set()
t_pairs = []
t_paths = []
t_interactions = []
start = time.time()
for i in xrange(len(content)):
    (id1, id2, path, count, numInteraction) = content[i].split('|')
    t_pairs.append(''.join(sorted([id1, id2])))
    t_paths.append(path)
    t_interactions.append(int(numInteraction))
t_unique_paths = set(t_paths)
end = time.time()
end - start
# remove dups, keeping the first path seen for each pair
seen = set()
ut_pairs = []
ut_paths = []
start = time.time()
for i in xrange(len(t_pairs)):
    if t_pairs[i] not in seen:
        seen.add(t_pairs[i])
        ut_pairs.append(t_pairs[i])
        ut_paths.append(t_paths[i])
end = time.time()
print end - start
start = time.time()
pdTPaths = pd.Series(ut_paths, index=ut_pairs)
end = time.time()
end - start
start = time.time()
t_feats = pd.get_dummies(pdTPaths, sparse=True)
end = time.time()
# assign label '1' to all t_ examples
t_labels = pd.Series(1, index=ut_pairs, dtype=np.uint8)
# combine and relabel
start = time.time()
t_combinedFeatsT = pd.concat([t_feats, t_labels], axis=1)
end = time.time()
print end - start
cols = t_combinedFeatsT.columns.tolist()
cols[-1] = 'label'
t_combinedFeatsT.columns = cols
In [5]:
# remove the c_ examples that are in t_
start = time.time()
overlap = combinedFeatsT.index.intersection(t_feats.index)
# keep only the c_ rows whose pair does not also appear in t_
ins = ~combinedFeatsT.index.isin(overlap)
smallFeats = combinedFeatsT.loc[ins, :].to_dense()
end = time.time()
end - start
In [6]:
# combine c_ and t_ for Mar
start = time.time()
# smallFeats is already dense (see above); only the t_ frame needs densifying
t_dense = t_combinedFeatsT.to_dense()
testing = pd.concat([smallFeats, t_dense])
end = time.time()
end - start
In [7]:
# combine with a sample row from training to sync up on columns
a = training[:1]
testing = pd.concat([testing, a])
testing.drop(a.index, inplace=True)
# dedup the index, fill na, maintenance
testing = testing[~testing.index.duplicated(keep='first')]
testing = testing.fillna(0)
In [8]:
# remove redundant columns
start = time.time()
cols = [col for col in testing.columns if col not in training.columns]
testing.drop(cols, inplace=True, axis=1, errors='ignore')
end = time.time()
end - start
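In [ ]:
# One-step alternative (sketch): reindex aligns the test columns to the
# training schema directly, adding any missing columns as 0 and dropping
# extras. `alignedT` is a hypothetical name; this runs on a copy.
alignedT = testing.reindex(columns=training.columns, fill_value=0)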
In [12]:
start = time.time()
pickle.dump(testing, open('testFinal.p', 'wb'))
end = time.time()
end - start