In [1]:
import pandas as pd
# Load the basic training-device and cookie tables; the first row of each
# CSV supplies the column names.
df_dev_train = pd.read_csv(
    filepath_or_buffer='/Users/agupta/Downloads/device-data-sets/dev_train_basic.csv',
    header=0)
df_cookies = pd.read_csv(
    filepath_or_buffer='/Users/agupta/Downloads/device-data-sets/cookie_all_basic.csv',
    header=0)
# Disabled loads, kept for reference (id_all_ip.csv is read line by line in a
# later cell instead of through pandas):
#df_id_all_property = pd.read_csv('/Users/agupta/Downloads/device-data-sets/id_all_property.csv', header=0)
#df_id_all_ip = pd.read_csv('/Users/agupta/Downloads/device-data-sets/id_all_ip.csv', header=0)
In [18]:
import re
from operator import itemgetter
import csv
In [9]:
# Basic test-device table; row 0 of the CSV supplies the column names.
df_dev_test = pd.read_csv(
    filepath_or_buffer='/Users/agupta/Downloads/device-data-sets/dev_test_basic.csv',
    header=0)
In [11]:
# Show how many rows the test-device table has (the length of its row index).
print(len(df_dev_test.index))
In [62]:
# Keep the device_id column of the test table as a standalone Series.
df_dev_test_device = df_dev_test.loc[:, 'device_id']
In [59]:
# Split id_all_ip.csv into flat [id, ip] rows and write them out as two CSVs.
#
# Each data line is parsed as
#     <id>,<flag>,{(f1,f2,f3,f4,f5,f6,f7),(...),...}
# (shape inferred from the parsing code below); only the first field of every
# 7-field tuple is kept as the "ip" value, giving one output row per tuple.
#
# NOTE(review): lines whose flag is '0' are skipped, flag 1 is routed to
# devices_rows and any other flag to cookies_rows, whereas the original inline
# comments labelled the flag-1 branch "Cookies". The routing is deliberately
# left unchanged here -- confirm the meaning of the flag column against the
# data description before relying on the two outputs.


def _parse_id_ip_lines(lines, tuple_pattern):
    """Turn raw id_all_ip lines into two lists of [id, ip] rows.

    Args:
        lines: iterable of text lines; the first one is treated as the header
            and ignored. Blank lines are skipped.
        tuple_pattern: compiled regex whose ``findall`` yields the
            parenthesised 7-field tuples of a line as strings.

    Returns:
        A pair ``(flag_one_rows, other_rows)``: rows from lines whose flag is
        1, and rows from lines with any other flag except the skipped '0'.

    Raises:
        IndexError: if a non-blank data line has fewer than two comma fields.
        ValueError: if the flag field is not an integer.
    """
    flag_one_rows = []
    other_rows = []
    for line_no, raw_line in enumerate(lines):
        if line_no == 0:
            continue  # header row
        raw_line = raw_line.strip()
        if not raw_line:
            continue  # tolerate blank lines such as a trailing newline
        head, _sep, ip_blob = raw_line.partition('{')
        fields = head.split(',')
        record_id, flag = fields[0], fields[1]
        if flag == '0':
            continue
        target = flag_one_rows if int(flag) == 1 else other_rows
        for ip_tuple in tuple_pattern.findall(ip_blob):
            # Build a fresh list for every pair: re-appending one shared list
            # would leave every row of this id pointing at the last ip.
            first_field = ip_tuple[1:-1].split(',')[0]
            target.append([record_id, first_field])
    return flag_one_rows, other_rows


def _write_id_ip_csv(path, header, rows):
    """Write ``header`` followed by ``rows`` to ``path`` as CSV."""
    # 'wb' is the mode the csv module needs on Python 2, which this notebook
    # targets (it uses print statements elsewhere).
    with open(path, 'wb') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(rows)


r = re.compile(r'(\(\w+,\w+,\w+,\w+,\w+,\w+,\w+\))')
r2 = re.compile(',')  # not used in this cell; kept in case other cells reference it
title = ['id', 'ip']

# `with` guarantees the input file is closed even if a line fails to parse.
with open('/Users/agupta/Downloads/device-data-sets/id_all_ip.csv') as f:
    devices_rows, cookies_rows = _parse_id_ip_lines(f, r)
print('end')

devices_rows = sorted(devices_rows, key=itemgetter(0))
_write_id_ip_csv('/Users/agupta/Downloads/device-data-sets/devices_ip_arpit.csv',
                 title, devices_rows)

# Sort the cookie rows themselves; sorting devices_rows here would make the
# cookies file a duplicate of the devices file.
cookies_rows = sorted(cookies_rows, key=itemgetter(0))
_write_id_ip_csv('/Users/agupta/Downloads/device-data-sets/cookies_ip_arpit.csv',
                 title, cookies_rows)
In [60]:
# Load the two id/ip CSV files written by the parsing cell above; the first
# row of each file holds the column names.
df_cookies_ip = pd.read_csv(
    filepath_or_buffer='/Users/agupta/Downloads/device-data-sets/cookies_ip_arpit.csv',
    header=0)
df_devices_ip = pd.read_csv(
    filepath_or_buffer='/Users/agupta/Downloads/device-data-sets/devices_ip_arpit.csv',
    header=0)