In [1]:
import pandas as pd
from random import randint
import numpy as np
import cPickle as pickle
from copy import deepcopy

In [2]:
data = open('WolframDataDropRawData_DD9166f904472-c765-49bc-8ef6-c2d5f30112ae.tsv', 'r')

In [3]:
data_str=data.read()

In [4]:
to_fix={}

to_fix[' \t']=' '
to_fix['=\t']=''
to_fix['=\n']=''
to_fix['=\r']=''
to_fix['\tquot']=' quot'
to_fix['\tamp']=' amp'


# apply the cleanup rules cumulatively (replacing from data_str each pass would keep only the last fix)
fixed_data = data_str
for fix in to_fix.keys():
    fixed_data = fixed_data.replace(fix, to_fix[fix])


rows=fixed_data.split('\n')

for fix in to_fix.keys():
    rows=[r.replace(fix,to_fix[fix]) for r in rows]


rows = [r for r in rows[:]]
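
Not part of the original run, but a quick way to confirm the replacements took is to count leftover occurrences of each raw pattern (all should be zero, barring overlapping matches):

for fix in to_fix:
    print repr(fix), fixed_data.count(fix)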

In [5]:
parsed_rows = [r.split('\t') for r in rows]
print len(rows), len(parsed_rows)


1266 1266

In [6]:
key_vals = {}
for ix, row in enumerate(parsed_rows):
    key_vals[ix] = {}
    for e in row:
        if '=' in e:
            field_split=e.split('=',1)
            key=field_split[0].lower().strip()
            value=field_split[1]
            
            key_vals[ix][key] = value
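
For context, each field in a row is a key=value pair; a hypothetical row like the one below (field names taken from those used later in the notebook, values invented) becomes a small dict keyed by the lower-cased field name. Note that the value keeps its leading space, which is why the weather filter later compares against ' weather'.

# hypothetical example row: field names from the notebook, values invented
example = 'actor= weather\ttimestamp= 2015-04-21 19:00\tvalue= 68\tvalue2= 51\tvalue3= 62\tvalue3_type= F'
print {e.split('=', 1)[0].lower().strip(): e.split('=', 1)[1] for e in example.split('\t') if '=' in e}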

In [7]:
df = pd.DataFrame.from_dict(key_vals)
df = df.T
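
from_dict with its default orient treats the outer keys (the row indices ix) as columns, which is why the frame comes out sideways and needs the transpose; passing orient='index' builds it the right way up in one step:

# equivalent one-step construction
df = pd.DataFrame.from_dict(key_vals, orient='index')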

In [8]:
df.to_csv('datadrop_output.csv', sep='|', index=False)
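
Since the export uses a pipe separator, reading it back later needs the matching sep, e.g.:

# hypothetical round-trip check of the export
df_check = pd.read_csv('datadrop_output.csv', sep='|')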

In [9]:
df_weather = df[df.actor == ' weather']

In [10]:
new_weather_values = []
hours = range(24)
hours.remove(19)
for ix, data in df_weather.iterrows():
    time = pd.to_datetime(data.timestamp)
    year = str(time.year)
    month = str(time.month)
    day = str(time.day)
    value3_type = data.value3_type
    activity_type = data.activity_type
    actor = 'weather'
    col_list = ['activity_type', 'actor', 'timestamp', 'value3', 'value3_type']
    for hour in hours:
        # value = high temp, value2 = low temp, value3 = current temp
        if 9 < (hour - 7) < 20:
            # "daytime" hours: sample between the current temp and a little above the high
            value3 = randint(int(data.value3), int(data.value)+3)
        else:
            # remaining hours: sample between the low and the current temp
            value3 = randint(int(data.value2), int(data.value3))
        datetime = pd.to_datetime(day+'/'+month+'/'+year+'/'+str(hour), format='%d/%m/%Y/%H')
        col_values = [activity_type, actor, datetime, value3, value3_type]
    
        temp_list = []
        for col in df_weather.columns:
            if col in col_list:
                temp_list.append(col_values[col_list.index(col)])
            else:
                temp_list.append(np.nan)
        new_weather_values.append(tuple(temp_list))
        del temp_list
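
Each original weather reading spawns one synthetic row per remaining hour (hour 19 is skipped, presumably because the original observation already covers it), carrying only the columns in col_list and NaN elsewhere. A quick consistency check, not in the original run:

# one synthetic row per remaining hour per original weather reading
print len(new_weather_values), len(df_weather) * len(hours)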

In [11]:
df_new_weather_values = df_weather.append(pd.DataFrame(new_weather_values, columns=df_weather.columns))

Add Names


In [12]:
actor_name_lookup = pickle.load(open('actor_name_lookup.pkl', 'rb'))

In [13]:
def update_name(activity, actor):
    if activity is np.nan:
        return actor
    elif actor is np.nan:
        if activity in actor_name_lookup:
            return actor_name_lookup[activity]
        new_name = raw_input('What is the actor of this activity %s: ' % (activity))
        actor_name_lookup[activity] = new_name
        return new_name
        
    else:
        return actor
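
A few hypothetical calls to show the branches: when no activity is recorded the existing actor is kept, when both are present the actor wins, and only an unseen activity with a missing actor triggers the raw_input prompt (after which the answer is cached in actor_name_lookup).

print update_name(np.nan, ' weather')            # no activity: keep the actor
print update_name('example activity', 'Colin')   # actor already set: keep it
# update_name('example activity', np.nan) would prompt once, then reuse the cached name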

In [14]:
new_actor_list = []
for ix, row in df.iterrows():
    new_actor_list.append(update_name(row.activity_type, row.actor))

In [15]:
pickle.dump(actor_name_lookup, open('actor_name_lookup.pkl', 'wb'))

In [16]:
df.actor = [x.strip() if isinstance(x, str) else x for x in new_actor_list]

In [17]:
df.actor = ['Colin' if x != 'weather' else x for x in list(df.actor)]
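
At this point every non-weather row is attributed to Colin; a quick (hypothetical) check:

print df.actor.value_counts()   # expect only 'Colin' and 'weather'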

Add new user


In [18]:
df_new_user = df[df.actor == 'Colin']

In [19]:
df_new_user.actor = ['Xavier' for x in list(df_new_user.actor)]


/Users/colingerber/Documents/I_School_Classes/Internet_Of_Things/env/lib/python2.7/site-packages/pandas/core/generic.py:1974: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
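
The warning is pandas flagging the assignment to a slice of df; it appears to have taken effect here, but the warning-free pattern is to take an explicit copy before assigning:

# equivalent, warning-free sketch
df_new_user = df[df.actor == 'Colin'].copy()
df_new_user['actor'] = 'Xavier'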

In [20]:
df = df.append(df_new_user)

In [21]:
df_with_weather = df.append(pd.DataFrame(new_weather_values, columns=df_weather.columns))

In [22]:
df_with_weather.to_csv('datadrop_output_with_weather.csv', sep='|', index=False)
