In [1]:
import time
import pandas as pd
import numpy as np
import cPickle as pickle
import random
In [ ]:
start = time.time()
with open('data/c_pairs_anon_scored.feb') as f:
    content = f.read().splitlines()
end = time.time()
end - start
In [ ]:
# parse February c_ records: one 'id1|id2|path|count|numMeeting' line per pair/path
unique_paths = set()
pairs = []
paths = []
meetings = []
start = time.time()
for i in xrange(len(content)):
    if i % 10000000 == 0:
        print i, ': ', time.time()-start
    (id1, id2, path, count, numMeeting) = content[i].split('|')
    # canonical pair key: the two ids concatenated in sorted order
    pairs.append(''.join(sorted([id1, id2])))
    paths.append(path)
    meetings.append(int(numMeeting))
unique_paths = set(paths)
end = time.time()
end - start
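In [ ]:
# Alternative sketch: the same pipe-delimited parse done with pandas.read_csv.
# The column names below are assumptions inferred from the split('|') above;
# this is not the original pipeline, just a more vectorized equivalent.
c_raw = pd.read_csv('data/c_pairs_anon_scored.feb', sep='|', header=None,
                    names=['id1', 'id2', 'path', 'count', 'numMeeting'],
                    dtype={'id1': str, 'id2': str})
c_raw['pair'] = [''.join(sorted(t)) for t in zip(c_raw['id1'], c_raw['id2'])]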
In [ ]:
start = time.time()
# uint8 keeps this Series small; assumes every meeting count fits in 0-255
pdMeetings = pd.Series(meetings, index=pairs, dtype=np.uint8)
end = time.time()
end - start
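In [ ]:
# Sanity check (sketch): np.uint8 silently wraps above 255, so it is worth
# confirming that the raw counts actually fit in the dtype chosen above.
max(meetings)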
In [ ]:
start = time.time()
pdPaths = pd.Series(paths, index=pairs)
end = time.time()
end - start
In [ ]:
start = time.time()
feats = pd.get_dummies(pdPaths, sparse=True)
end = time.time()
print end - start
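In [ ]:
# Toy illustration (sketch): get_dummies turns each distinct path string into
# a 0/1 indicator column, indexed by pair. The values here are made up.
demo = pd.Series(['a>b', 'a>c', 'a>b'], index=['pair1', 'pair2', 'pair3'])
pd.get_dummies(demo)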
In [ ]:
start = time.time()
# join the one-hot path features with the meeting counts, then keep a 0.5% sample
combinedFeats = pd.concat([feats, pdMeetings], axis=1).sample(frac=0.005)
end = time.time()
print end - start
# combinedFeats.shape
# (393169, 170)
# %xdel combinedFeats
In [ ]:
# relabel: rename the appended meetings column to 'numOfMeetings'
cols = combinedFeats.columns.tolist()
cols[-1] = 'numOfMeetings'
combinedFeats.columns = cols
In [ ]:
# read February t_space data to create label vector
start = time.time()
with open('data/t_pairs_anon_verbs.feb.txt') as f:
    content = f.read().splitlines()
end = time.time()
end - start
In [ ]:
# t_ records carry an extra verb field: 'id1|id2|verb|path|count|numInteraction'
t_unique_paths = set()
t_pairs = []
t_paths = []
t_interactions = []
start = time.time()
for i in xrange(len(content)):
    (id1, id2, verb, path, count, numInteraction) = content[i].split('|')
    t_pairs.append(''.join(sorted([id1, id2])))
    t_paths.append(path)
    t_interactions.append(int(numInteraction))
t_unique_paths = set(t_paths)
end = time.time()
end - start
In [ ]:
# remove dups: keep the first path seen for each pair; a set makes the
# membership test O(1) instead of rescanning the ut_pairs list each time
seen = set()
ut_pairs = []
ut_paths = []
start = time.time()
for i in xrange(len(t_pairs)):
    if t_pairs[i] not in seen:
        seen.add(t_pairs[i])
        ut_pairs.append(t_pairs[i])
        ut_paths.append(t_paths[i])
end = time.time()
print end - start
In [ ]:
start = time.time()
pdTPaths = pd.Series(ut_paths, index=ut_pairs)
end = time.time()
end - start
In [ ]:
start = time.time()
t_feats = pd.get_dummies(pdTPaths, sparse=True)
end = time.time()
print end - start
In [ ]:
# assign label '1' to all t_ examples
t_labels = pd.Series(1, index=ut_pairs, dtype=np.uint8)
In [ ]:
# combine and relabel
start = time.time()
t_combinedFeats = pd.concat([t_feats, t_labels], axis=1)
end = time.time()
print end - start
# relabel: rename the appended labels column to 'label'
cols = t_combinedFeats.columns.tolist()
cols[-1] = 'label'
t_combinedFeats.columns = cols
In [ ]:
# combine c_ and t_ examples (c_ contains mostly negative, t_ contains positive)
start = time.time()
# densify the sparse frames before concatenating
combinedFeats = combinedFeats.to_dense()
t_combinedFeats = t_combinedFeats.to_dense()
training = pd.concat([combinedFeats, t_combinedFeats])
end = time.time()
end - start
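In [ ]:
# Sanity check (sketch): positive vs. negative counts in the combined frame.
# 'label' is still NaN on the c_ rows at this point, until the fillna below.
training['label'].value_counts(dropna=False)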
In [ ]:
# remove pairs that appear in both c_ and t_ (keep the first occurrence), fill na
training = training[~training.index.duplicated(keep='first')]
training = training.fillna(0)
training = training.to_dense()
In [ ]:
# recover num of meeting for positive training examples
start = time.time()
positiveTrainingEx = training.loc[training.label==1, :].index.intersection(pdMeetings.index)
for hashId in positiveTrainingEx:
    training.at[hashId, 'numOfMeetings'] = float(pdMeetings.at[hashId])
end = time.time()
end - start
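In [ ]:
# Vectorized alternative (sketch): assign all recovered counts in one .loc call.
# pdMeetings can hold several rows per pair (one per path), so duplicates are
# dropped first; the uniqMeetings name is hypothetical, not from the original.
uniqMeetings = pdMeetings[~pdMeetings.index.duplicated(keep='first')]
training.loc[positiveTrainingEx, 'numOfMeetings'] = \
    uniqMeetings.loc[positiveTrainingEx].astype(float).values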
In [ ]:
# trim negative examples a little more: drop each with probability 0.7
removes = []
start = time.time()
for hashId in training.index:
    if training.at[hashId, 'label'] == 0:
        if random.random() < 0.7:
            removes.append(hashId)
end = time.time()
end - start
training.drop(removes, axis=0, inplace=True)
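In [ ]:
# Equivalent vectorized sketch, run on a copy so it does not trim `training`
# again: keep every positive plus a ~30% sample of the negatives.
trimmed = pd.concat([training[training['label'] == 1],
                     training[training['label'] == 0].sample(frac=0.3)])
trimmed.shape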
In [ ]:
# export to file
start = time.time()
pickle.dump(training, open('trainFinal.p', 'wb'))
end = time.time()
end - start
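In [ ]:
# Sketch: binary pickle protocol 2 is usually much faster and smaller than the
# Python 2 default (protocol 0) used above; the filename here is hypothetical.
pickle.dump(training, open('trainFinal.p2', 'wb'), protocol=2)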
In [2]:
start = time.time()
training = pickle.load(open('trainFinal.p', 'rb'))
end = time.time()
end - start
In [3]:
# creating test set from c_ for Mar
start = time.time()
with open('data/c_uniq_pairs_anon_scored.mar') as f:
    content = f.read().splitlines()
end = time.time()
end - start
unique_paths = set()
pairs = []
paths = []
meetings = []
start = time.time()
for i in xrange(len(content)):
    if i % 10000000 == 0:
        print i, ': ', time.time()-start
    (id1, id2, path, count, numMeeting) = content[i].split('|')
    pairs.append(''.join(sorted([id1, id2])))
    paths.append(path)
    meetings.append(int(numMeeting))
unique_paths = set(paths)
end = time.time()
end - start
start = time.time()
pdMeetings = pd.Series(meetings, index=pairs, dtype=np.uint8)
end = time.time()
end - start
start = time.time()
pdPaths = pd.Series(paths, index=pairs)
end = time.time()
end - start
start = time.time()
feats = pd.get_dummies(pdPaths, sparse=True)
end = time.time()
print end - start
start = time.time()
combinedFeatsT = pd.concat([feats, pdMeetings], axis=1).sample(frac=0.001)
end = time.time()
print end - start
cols = combinedFeatsT.columns.tolist()
cols[-1] = 'numOfMeetings'
combinedFeatsT.columns = cols
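In [ ]:
# Refactoring sketch: the Feb and Mar c_ pipelines are identical apart from
# the input file and sample fraction, so they could share one helper. The
# function name and signature are hypothetical, not from the original.
def build_c_feats(filename, frac):
    with open(filename) as f:
        lines = f.read().splitlines()
    pairs, paths, meetings = [], [], []
    for line in lines:
        (id1, id2, path, count, numMeeting) = line.split('|')
        pairs.append(''.join(sorted([id1, id2])))
        paths.append(path)
        meetings.append(int(numMeeting))
    feats = pd.get_dummies(pd.Series(paths, index=pairs), sparse=True)
    counts = pd.Series(meetings, index=pairs, dtype=np.uint8)
    out = pd.concat([feats, counts], axis=1).sample(frac=frac)
    cols = out.columns.tolist()
    cols[-1] = 'numOfMeetings'
    out.columns = cols
    return out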
In [4]:
# getting t_ examples for Mar
start = time.time()
with open('data/t_pairs_anon.mar') as f:
    content = f.read().splitlines()
end = time.time()
end - start
# note: the Mar t_ file has no verb field: 'id1|id2|path|count|numInteraction'
t_unique_paths = set()
t_pairs = []
t_paths = []
t_interactions = []
start = time.time()
for i in xrange(len(content)):
    (id1, id2, path, count, numInteraction) = content[i].split('|')
    t_pairs.append(''.join(sorted([id1, id2])))
    t_paths.append(path)
    t_interactions.append(int(numInteraction))
t_unique_paths = set(t_paths)
end = time.time()
end - start
# remove dups, keeping the first path seen for each pair
seen = set()
ut_pairs = []
ut_paths = []
start = time.time()
for i in xrange(len(t_pairs)):
    if t_pairs[i] not in seen:
        seen.add(t_pairs[i])
        ut_pairs.append(t_pairs[i])
        ut_paths.append(t_paths[i])
end = time.time()
print end - start
start = time.time()
pdTPaths = pd.Series(ut_paths, index=ut_pairs)
end = time.time()
end - start
start = time.time()
t_feats = pd.get_dummies(pdTPaths, sparse=True)
end = time.time()
# assign label '1' to all t_ examples
t_labels = pd.Series(1, index=ut_pairs, dtype=np.uint8)
# combine and relabel
start = time.time()
t_combinedFeatsT = pd.concat([t_feats, t_labels], axis=1)
end = time.time()
print end - start
cols = t_combinedFeatsT.columns.tolist()
cols[-1] = 'label'
t_combinedFeatsT.columns = cols
In [5]:
# remove the c_ examples that are in t_
start = time.time()
overlap = combinedFeatsT.index.intersection(t_feats.index)
# keep only the c_ rows whose pair does not also appear in t_
ins = ~combinedFeatsT.index.isin(overlap)
smallFeats = combinedFeatsT.loc[ins, :].to_dense()
end = time.time()
end - start
In [6]:
# combine c_ and t_ for Mar
start = time.time()
# smallFeats is already dense (see above); only the t_ frame needs densifying
t_dense = t_combinedFeatsT.to_dense()
testing = pd.concat([smallFeats, t_dense])
end = time.time()
end - start
In [7]:
# combine with a sample row from training to sync up on columns
a = training[:1]
testing = pd.concat([testing, a])
testing.drop(a.index, inplace=True)
# dedup the index, fill na, maintenance
testing = testing[~testing.index.duplicated(keep='first')]
testing = testing.fillna(0)
In [8]:
# remove redundant columns
start = time.time()
cols = [col for col in testing.columns if col not in training.columns]
testing.drop(cols, inplace=True, axis=1, errors='ignore')
end = time.time()
end - start
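In [ ]:
# One-step alternative (sketch): reindex aligns the test columns to the
# training schema directly, adding any missing columns as 0 and dropping
# extras. `alignedT` is a hypothetical name; this runs on a copy.
alignedT = testing.reindex(columns=training.columns, fill_value=0)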
In [12]:
start = time.time()
pickle.dump(testing, open('testFinal.p', 'wb'))
end = time.time()
end - start