In [1]:
import pandas as pd
from random import randint
import numpy as np
import cPickle as pickle
from copy import deepcopy
In [2]:
# Open the raw Wolfram Data Drop export for reading; the handle is read in the
# following cell.
data = open(
    'WolframDataDropRawData_DD9166f904472-c765-49bc-8ef6-c2d5f30112ae.tsv',
    mode='r',
)
In [3]:
# Read the whole export into memory, then release the file handle opened in the
# previous cell (it was never closed before). Re-running this cell without
# re-opening the file now fails loudly instead of silently yielding ''.
try:
    data_str = data.read()
finally:
    data.close()
In [4]:
# Ordered (pattern, replacement) pairs that repair artefacts in the raw export:
#   * a tab that follows a space collapses to a single space,
#   * '=' directly followed by a tab, newline or carriage return is dropped,
#   * a tab directly before 'quot' / 'amp' becomes a space.
# Presumably this stops stray tabs and wrapped lines from being mistaken for
# field / record separators by the splits below -- confirm against the raw file.
# A list (rather than iterating a dict) keeps the application order fixed.
replacements = [
    (' \t', ' '),
    ('=\t', ''),
    ('=\n', ''),
    ('=\r', ''),
    ('\tquot', ' quot'),
    ('\tamp', ' amp'),
]
# Kept under its original name for any cell that still refers to the mapping.
to_fix = dict(replacements)

# Apply every replacement cumulatively. The original restarted from `data_str`
# on each iteration, so only the last replacement survived and '=\n' line
# continuations were usually never joined before the split on '\n'.
fixed_data = data_str
for pattern, replacement in replacements:
    fixed_data = fixed_data.replace(pattern, replacement)

rows = fixed_data.split('\n')

# Second pass, row by row: removing characters above can create a new match
# for an earlier pattern (e.g. ' =\t\t' first becomes ' \t').
for pattern, replacement in replacements:
    rows = [r.replace(pattern, replacement) for r in rows]
In [5]:
# Break every cleaned line into its tab-delimited fields.
parsed_rows = []
for r in rows:
    parsed_rows.append(r.split('\t'))
# Show both counts side by side as a quick sanity check.
print('%d %d' % (len(rows), len(parsed_rows)))
In [6]:
# Build {row number: {field name: raw value}} from the parsed rows.
# Only elements containing '=' are kept; the text before the first '=' is the
# field name (lower-cased and stripped), the text after it is stored untouched.
key_vals = {}
for ix, row in enumerate(parsed_rows):
    fields = {}
    for e in row:
        name, sep, raw = e.partition('=')
        if not sep:
            # No '=' in this element, so it is not a key/value pair.
            continue
        fields[name.lower().strip()] = raw
    key_vals[ix] = fields
In [7]:
# from_dict treats the outer keys (row numbers) as columns, so transpose to get
# one record per row and one field per column.
df = pd.DataFrame.from_dict(key_vals).transpose()
In [8]:
# Write the parsed records as a pipe-delimited file without the row index.
# The original passed `index_col=False`, which is a read_csv option; the
# to_csv argument that suppresses the index column is `index`.
df.to_csv('datadrop_output.csv', sep='|', index=False)
In [9]:
# Keep only the weather records. The literal carries a leading space because
# field values are stored unstripped when the rows are parsed.
df_weather = df.loc[df['actor'] == ' weather']
In [10]:
# Fabricate one synthetic weather reading per hour for every real weather record.
# Each entry is a tuple laid out in df_weather's column order, so the list can be
# turned into a frame with `columns=df_weather.columns`; columns that are not
# filled in below are set to NaN.
new_weather_values = []
# Every hour of the day except 19.
# NOTE(review): presumably 19 is skipped because the real reading covers it -- confirm.
hours = [h for h in range(24) if h != 19]
weather_columns = list(df_weather.columns)

for ix, weather_row in df_weather.iterrows():
    stamp = pd.to_datetime(weather_row.timestamp)
    date_prefix = '%d/%d/%d' % (stamp.day, stamp.month, stamp.year)
    # value = high temp, value2 = low temp, value3 = current temp
    high_temp = int(weather_row.value)
    low_temp = int(weather_row.value2)
    current_temp = int(weather_row.value3)

    for hour in hours:
        # The original condition joined the two comparisons with `or`, which is
        # true for every hour and left the else branch unreachable.
        # NOTE(review): (hour - 7) is negative for hours 0-6, so those hours
        # always take the else branch; if -7 is a timezone offset a modulo-24
        # wrap may be intended -- confirm.
        if 9 < (hour - 7) < 20:
            bounds = (current_temp, high_temp + 3)
        else:
            bounds = (low_temp, current_temp)
        # randint raises ValueError when its lower bound exceeds the upper one,
        # so order the bounds first.
        value3 = randint(min(bounds), max(bounds))

        hour_stamp = pd.to_datetime('%s/%d' % (date_prefix, hour),
                                    format='%d/%m/%Y/%H')
        synthetic = {
            'activity_type': weather_row.activity_type,
            'actor': 'weather',
            'timestamp': hour_stamp,
            'value3': value3,
            'value3_type': weather_row.value3_type,
        }
        new_weather_values.append(
            tuple(synthetic.get(col, np.nan) for col in weather_columns))
In [11]:
# Stack the synthetic hourly readings underneath the real weather records.
# pd.concat is the equivalent of DataFrame.append, which newer pandas releases
# deprecated and then removed.
df_new_weather_values = pd.concat(
    [df_weather, pd.DataFrame(new_weather_values, columns=df_weather.columns)])
In [12]:
# Load the activity -> actor lookup from disk (the same file is rewritten after
# the actors have been resolved). The `with` block closes the file handle, which
# the original left open.
# NOTE(security): unpickling can execute arbitrary code; only load a pickle
# file that you created yourself.
with open('actor_name_lookup.pkl', 'rb') as lookup_file:
    actor_name_lookup = pickle.load(lookup_file)
In [13]:
def update_name(activity, actor):
    """Return the actor for a record, filling in a missing actor from its activity.

    Parameters
    ----------
    activity : the record's activity type, or NaN/None when absent.
    actor : the record's actor, or NaN/None when absent.

    Returns
    -------
    `actor` unchanged when it is present or when `activity` is missing.
    Otherwise the name stored for `activity` in the module-level
    `actor_name_lookup`; if there is none yet, the user is prompted and the
    answer is stored in `actor_name_lookup` before being returned.

    Missing values are detected with `pd.isnull` rather than `is np.nan`: the
    identity test only matches the `np.nan` singleton and misses any other NaN
    float object.
    """
    if pd.isnull(activity) or not pd.isnull(actor):
        return actor
    if activity in actor_name_lookup:
        return actor_name_lookup[activity]
    # Unknown activity: ask once and remember the answer for later records.
    new_name = raw_input('What is the actor of this activity %s: ' % (activity))
    actor_name_lookup[activity] = new_name
    return new_name
In [14]:
# Resolve the actor of every record, in row order.
new_actor_list = [update_name(row.activity_type, row.actor)
                  for ix, row in df.iterrows()]
In [15]:
# Save the (possibly extended) lookup back to disk. The `with` block guarantees
# the file is flushed and closed; the original never closed the handle itself.
with open('actor_name_lookup.pkl', 'wb') as lookup_file:
    pickle.dump(actor_name_lookup, lookup_file)
In [16]:
# Store the resolved actors on the frame. Plain `str` values lose their
# surrounding whitespace; anything else (e.g. NaN) is passed through unchanged.
df['actor'] = [x if type(x) is not str else x.strip() for x in new_actor_list]
In [17]:
# Every actor other than 'weather' (missing values included) is replaced by the
# single name 'Colin'.
df['actor'] = [x if x == 'weather' else 'Colin' for x in df.actor]
In [18]:
# Pull out all of Colin's records.
df_new_user = df.loc[df['actor'] == 'Colin']
In [19]:
# Relabel the selected records as a second user, 'Xavier'.
# df_new_user is a filtered selection of df, so take an explicit copy before
# assigning; writing to the selection directly triggers pandas'
# SettingWithCopyWarning. A scalar is broadcast to every row, so no list of
# repeated names is needed.
df_new_user = df_new_user.copy()
df_new_user['actor'] = 'Xavier'
In [20]:
# Add the 'Xavier' records underneath the existing ones.
# pd.concat is the equivalent of DataFrame.append, which newer pandas releases
# deprecated and then removed.
# NOTE: this rebinds `df`, so re-running the cell appends the records again.
df = pd.concat([df, df_new_user])
In [21]:
# Final data set: all user records plus the synthetic hourly weather readings.
# pd.concat is the equivalent of DataFrame.append, which newer pandas releases
# deprecated and then removed.
df_with_weather = pd.concat(
    [df, pd.DataFrame(new_weather_values, columns=df_weather.columns)])
In [22]:
# Write the combined data as a pipe-delimited file without the row index.
# The original passed `index_col=False`, which is a read_csv option; the
# to_csv argument that suppresses the index column is `index`.
df_with_weather.to_csv('datadrop_output_with_weather.csv', sep='|', index=False)
In [23]: