In [1]:
import pandas as pd

# Base tables: device training set and the full cookie table.
# header=0 -> the first line of each CSV supplies the column names.
df_dev_train = pd.read_csv(
    '/Users/agupta/Downloads/device-data-sets/dev_train_basic.csv',
    header=0,
)
df_cookies = pd.read_csv(
    '/Users/agupta/Downloads/device-data-sets/cookie_all_basic.csv',
    header=0,
)

#df_id_all_property = pd.read_csv('/Users/agupta/Downloads/device-data-sets/id_all_property.csv', header=0)
#df_id_all_ip = pd.read_csv('/Users/agupta/Downloads/device-data-sets/id_all_ip.csv', header=0)

In [18]:
import re
from operator import itemgetter
import csv

In [9]:
# Device test set; the first line of the CSV supplies the column names.
df_dev_test = pd.read_csv(
    '/Users/agupta/Downloads/device-data-sets/dev_test_basic.csv',
    header=0,
)

Insights

Cookies

  1. Number of OS types: 251

Challenge

  1. Number of rows/devices: 61156

In [11]:
# Row count of the device test set (call form prints the same text on Python 2 and 3).
print(len(df_dev_test))


61156

In [62]:
# Keep just the 'device_id' column of the test frame as its own Series.
df_dev_test_device = df_dev_test['device_id']

In [59]:
# Flatten id_all_ip.csv into two (id, ip) CSV files, one output row per IP tuple.
#
# Every data line is read as "<id>,<indicator>,{(f1,..,f7),(f1,..,f7),...}".
# Only the first field of each 7-field tuple is written to the 'ip' column.
#
# Fixes relative to the previous version of this cell:
#   * a fresh [id, ip] list is built for every IP.  Previously one list per
#     input line was mutated and appended repeatedly, so every row of an id
#     ended up holding that id's *last* IP;
#   * cookies_ip_arpit.csv is now written from cookies_rows (it used to be
#     written from a re-sorted copy of devices_rows);
#   * files are managed with `with`, so they are closed even if parsing fails.
#
# NOTE(review): lines whose indicator is '0' are skipped entirely, so the
# `else` branch below only ever receives indicators other than 0 and 1.  If
# the file contains only 0/1 indicators, cookies_rows stays empty and
# cookies_ip_arpit.csv will contain just the header -- confirm this is intended.
# NOTE(review): the earlier inline comments called the `== 1` branch "Cookies"
# and the other branch "Devices", the opposite of the list names used here.
# The routing is left unchanged; verify the indicator meaning against the
# data-set documentation.

r = re.compile(r'(\(\w+,\w+,\w+,\w+,\w+,\w+,\w+\))')  # one parenthesised 7-field tuple
r2 = re.compile(',')  # not used in this cell; kept in case other cells rely on it
title = ['id', 'ip']
devices_rows = list()
cookies_rows = list()


def _write_id_ip_csv(path, rows):
    """Write the `title` header followed by `rows` to the CSV file at `path`."""
    # 'wb' is the mode the Python 2 csv module expects (this notebook runs on
    # Python 2); on Python 3 this would have to be open(path, 'w', newline='').
    with open(path, 'wb') as output:
        writer = csv.writer(output)
        writer.writerow(title)
        writer.writerows(rows)


with open('/Users/agupta/Downloads/device-data-sets/id_all_ip.csv') as f:
    next(f, None)  # skip the header line
    for line in f:
        parts = line.strip().split('{')

        # Fields in front of the '{'; the last one is dropped (presumably the
        # empty string left by the trailing comma -- TODO confirm).
        row = parts[0].split(',')[: -1]
        if row[1] == '0':
            continue
        device_cookie = int(row[1])

        # parts[1][: -1] drops the final character of the tuple list
        # (presumably the closing '}' -- TODO confirm).
        for ip in r.findall(parts[1][: -1]):
            ip_data_array = ip.strip().replace("(", "").replace(")", "").split(',')
            # New list per IP so that earlier rows are not overwritten.
            id_ip_row = [row[0], ip_data_array[0]]
            if device_cookie == 1:
                devices_rows.append(id_ip_row)
            else:
                cookies_rows.append(id_ip_row)
print('end')

# sorted() is stable, so the IPs of one id keep their order of appearance.
devices_rows = sorted(devices_rows, key=itemgetter(0))
_write_id_ip_csv('/Users/agupta/Downloads/device-data-sets/devices_ip_arpit.csv', devices_rows)

cookies_rows = sorted(cookies_rows, key=itemgetter(0))
_write_id_ip_csv('/Users/agupta/Downloads/device-data-sets/cookies_ip_arpit.csv', cookies_rows)


 end

In [60]:
# Load the two flattened (id, ip) tables back in; the first line of each CSV is the header.
df_cookies_ip = pd.read_csv(
    '/Users/agupta/Downloads/device-data-sets/cookies_ip_arpit.csv',
    header=0,
)
df_devices_ip = pd.read_csv(
    '/Users/agupta/Downloads/device-data-sets/devices_ip_arpit.csv',
    header=0,
)