In [22]:
#0) What: Test steps 1,2,3 in local Postgres instance
#Who: Bryan/Jesse

#1) What: Create second table(s) within Postgres. I suggest just adding "_cleaned" to the name and using the same prefix (see the sketch after step 6).
#The schema will remain the same. William - any objections here? We don't have a dev/staging Postgres instance, do we? 
#Who: Jesse

#2a) What: Take Bryan's new Python and test each 'rule,' adding logic for writing to Postgres/the new 'cleaned' table.
#Who: Bryan/Jesse

#2b) What: Add logic for alerts (SendGrid); fire alert when issue found, correction made.
#Who: Jesse

#3) What: Verify that results look good in *_cleaned table(s).
#Who: Bryan/Jesse/William(?)/anyone else who wants to help test

#4) What: Set up cron job to run, initially, every 4-6 hours. Review results and make sure that:
#Data is not thrown out or unnecessarily altered (i.e., integrity is retained)
#Any data corrections are done properly
#Review / tweak anything else (see Bryan's notes in Asana: "Data Cleaning Notes" under "Data Science Brainstorming")
#Who: Bryan/Jesse

#5) What: After 5-7 days of testing, 'deploy.' Cron job does not need to run as often; decide on interval. 
#Who: Jesse

#6) What: Point all relevant queries to *_cleaned table(s)
#Who: Bryan/Jesse/Emily
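
#A minimal sketch for step 1 (table/script names below are placeholders - Postgres can copy a
#table's schema with CREATE TABLE ... (LIKE ...)):
#    connection = psycopg2.connect(database='heatseek', user='heatseekroot', password='wearecoolsoweseekheat')
#    cursor = connection.cursor()
#    cursor.execute('CREATE TABLE IF NOT EXISTS readings_cleaned (LIKE readings INCLUDING ALL);')
#    connection.commit()
#    cursor.close()
#    connection.close()
#For step 4, the cron entry could look something like this (every 6 hours):
#    0 */6 * * * python /path/to/clean_readings.py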


#Will it tell the difference between an account that is consistently down and one that is inconsistently down?
#After missing a reading, does a sensor ever get back to 100% (visualize this)
#csv of sensor ids, date, and percentage (goes to a website?)
#can it pull the person's first and last names?
#first name last name (sensor id), dates, percentages below

#Fix tenant names


#Code to visualize this stuff - monthly and yearly time periods (rough sketch below)

#Emily source code visualization plot %
#Talk to William or Jesse about automating this and letting Noelle access it
#clean data function (remove bad data)
#stats and viz stuff - 311 data, HPD data (blog posts)
#is HPD monitoring its own performance at resolving heat complaints?

#HPD conversion rate is low - who and why? (conversion rate = complaint to investigation) (summer blog posts)
#how many heat complaints to date? (visualization)

#A tale of two cities - adjust for population (complaints per capita)
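
In [ ]:
#A rough sketch of the "visualize uptime by month/year" idea above. This assumes the
#'readings' dataframe loaded in the cells below (with 'created_at' and 'sensor_id' columns),
#roughly hourly polling, and a reasonably recent pandas; sensor id 19 is just an example.
import pandas as pd
import matplotlib.pyplot as plt

def monthly_uptime_percentage(readings, sensor_id):
    r = readings.loc[readings.sensor_id == sensor_id].copy()
    r['created_at'] = pd.to_datetime(r['created_at'])
    #1 for every hour that received at least one reading, 0 for hours with none...
    hourly = r.set_index('created_at').resample('H').size().clip(0, 1)
    #...so the monthly mean of that 0/1 series is the % of hours with a reading
    return hourly.resample('M').mean() * 100

monthly_uptime_percentage(readings, 19).plot(kind='bar')
plt.ylabel('% of hours with a reading')
plt.show()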

In [1]:
import datetime
import pandas as pd
import numpy as np
import psycopg2
import csv
import time
from datetime import date

In [37]:
try:
    connection = psycopg2.connect(database ='heatseek', user = 'heatseekroot', password = 'wearecoolsoweseekheat')
    cursor = connection.cursor() #Open a cursor to perform operations
    
    cursor.execute('SELECT * from users') #Executes the query
    users = cursor.fetchall() #cursor.fetchone() for one line, fetchmany() for multiple lines, fetchall() for all lines
    users = pd.DataFrame(users) #Saves 'users' as a pandas dataframe
    users_header = [desc[0] for desc in cursor.description] #This gets the descriptions from cursor.description 
    #(names are in the 0th index)
    users.columns = users_header #PD array's column names
    
    cursor.execute('SELECT * FROM readings;')
    readings = cursor.fetchall()
    readings = pd.DataFrame(readings)
    readings_header = [desc[0] for desc in cursor.description]
    readings.columns = readings_header
    
    cursor.execute('SELECT * FROM sensors;')
    sensors = cursor.fetchall()
    sensors = pd.DataFrame(sensors)
    sensors_header = [desc[0] for desc in cursor.description]
    sensors.columns = sensors_header

    cursor.close() 
    connection.close()
    
except psycopg2.DatabaseError as error:
    print 'Error %s' % error

In [159]:
#This creates an array 'sensors_with_users' that consists of sensors that are currently assigned to users.
sensors_with_users_raw = np.intersect1d(users.id.unique(), sensors.user_id.unique()) #Returns the common ids in both the datasets.
#sensors.loc[sensors.user_id, sensors_with_users]
sensors_with_users = [int(ids) for ids in sensors_with_users_raw]

In [3]:
#dirty_data() builds a report of sensors with suspect readings; clean_data() drops readings with no sensor ID.
#TODO: also flag sensors that are polling faster than once per hour (i.e., test cases).

def dirty_data(dirty_readings, start_date = None, end_date = None):
    if start_date is None:
        start_date = pd.Timestamp('2000-01-01')
    else:
        start_date = pd.Timestamp(start_date)
    if end_date is None:
        end_date = pd.Timestamp(datetime.datetime.now())
    else:
        end_date = pd.Timestamp(end_date)
    
    mask = (dirty_readings['created_at'] > start_date) & (dirty_readings['created_at'] <= end_date)
    dirty_readings = dirty_readings.loc[mask]
    
    hot_ids = dirty_readings.loc[dirty_readings.temp > 90].sensor_id.unique() #Returns sensor IDs where indoor temp is > 90
    cold_ids = dirty_readings.loc[dirty_readings.temp < 40].sensor_id.unique() #Returns sensor IDs where indoor temp is < 40
    inside_colder_ids = dirty_readings.loc[dirty_readings.temp < dirty_readings.outdoor_temp].sensor_id.unique() #Returns sensor IDs where indoor temp is < outdoor temp
    #Array of all the IDs above
    all_ids = np.unique(np.concatenate((hot_ids, cold_ids, inside_colder_ids)))
    all_ids = all_ids[~np.isnan(all_ids)]
    #Create an empty dataframe with the IDs as indices
    report = pd.DataFrame(index=all_ids,columns=['UserID','SensorID', 'Outside90', 'Inside40', 'InsideColderOutside'])
    #Fill in the specific conditions as '1'
    report.Outside90 = report.loc[hot_ids].Outside90.fillna(1)
    report.Inside40 = report.loc[cold_ids].Inside40.fillna(1)
    report.InsideColderOutside = report.loc[inside_colder_ids].InsideColderOutside.fillna(1)
    report = report.fillna(0)
    report.SensorID = report.index
    
    #Fill in UserIDs (look up by the sensors table's 'id' column, not its row index)
    for sensor_id in report.index.values:
        sensor_id = int(sensor_id)
        matching_users = sensors.loc[sensors.id == sensor_id, 'user_id']
        if len(matching_users) > 0:
            report.loc[sensor_id, 'UserID'] = matching_users.values[0]
        else:
            report.loc[sensor_id, 'UserID'] = 'No such sensor in sensors table.'
    return report
        


def clean_data(dirty_readings):
    cleaner_readings = dirty_readings[dirty_readings.sensor_id.notnull()] #Remove cases where there are no sensor IDs
    return cleaner_readings
    
clean_readings = clean_data(readings)
report = dirty_data(readings)

In [8]:
#This function takes (data, start date, end date, sensor id) and returns the tenant info plus the sensor's uptime percentages
def sensor_down_complete(data, start_date, end_date, sensor_id): 
    
    #This pulls up the tenant's first and last name.
    try:
        tenant_id = int(sensors.loc[sensors.id == sensor_id].user_id.values[0])
        tenant_first_name = users.loc[users.id == tenant_id].first_name.values[-1] #values[-1] takes the last matching entry
        tenant_last_name = users.loc[users.id == tenant_id].last_name.values[-1]
    #ValueError: user_id is NaN; IndexError: the sensor has no row in the sensors table
    except (ValueError, IndexError):
        tenant_id = 'None'
        tenant_first_name = 'Not'
        tenant_last_name = 'Assigned'
        
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)

    sensor_readings = data.loc[data.sensor_id == sensor_id]
    
    #Converting to timestamps
    #for i in sensor_readings.index.values: #Iterates through all the index values
        #sensor_readings.loc[i, 'created_at'] = pd.Timestamp(sensor_readings.created_at[i])
    #Using map instead of for loop (about 15-20x faster)
    try:
        sensor_readings.loc[:, 'created_at'] = map(pd.Timestamp, sensor_readings.created_at)
    except TypeError:
        tenant_first_name = 'Mapping Error'
        tenant_last_name = 'Only One Entry'
        pass
    #Using list comprehensions (as efficient as map)
    #sensor_readings.loc[:, 'created_at'] = [pd.Timestamp(x) for x in sensor_readings.created_at]
        
    #Using a boolean mask to select readings between the two dates 
    #(http://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates)   
    mask = (sensor_readings['created_at'] > start_date) & (sensor_readings['created_at'] <= end_date)
    masked_sensor_readings = sensor_readings.loc[mask] #Get all readings between the two dates
    masked_sensor_readings = masked_sensor_readings.sort_values('created_at')
    #We then calculate how many hours have passed for that specific sensor and date range
    try:
        sensor_readings_start_date = masked_sensor_readings.loc[masked_sensor_readings.index.values[0], 'created_at']
        sensor_readings_end_date = \
        masked_sensor_readings.loc[masked_sensor_readings.index.values[len(masked_sensor_readings)-1], 'created_at']
        timedelta_in_seconds =  sensor_readings_end_date - sensor_readings_start_date #This returns Timedelta object
        timedelta_in_seconds = timedelta_in_seconds.total_seconds()
        total_number_of_hours = timedelta_in_seconds/3600 + 1 #+1 counts both endpoints: N hourly readings span N-1 hours
        
        hours_in_date_range = ((end_date-start_date).total_seconds())/3600 + 1
        
    except IndexError:
        return [tenant_first_name, tenant_last_name, sensor_id, tenant_id, "No valid readings during this time frame."]
    
    proportion_of_total_uptime = (len(masked_sensor_readings)/hours_in_date_range) * 100 #Proportion of uptime over TOTAL HOURS
    proportion_within_sensor_uptime = (len(masked_sensor_readings)/total_number_of_hours) * 100 #Proportion of uptime for the sensor's first and last uploaded dates.
    if proportion_within_sensor_uptime <= 100.1:
        return [tenant_first_name, tenant_last_name, sensor_id, tenant_id, proportion_of_total_uptime, proportion_within_sensor_uptime]
    else:
        return [tenant_first_name, tenant_last_name, sensor_id, tenant_id, proportion_of_total_uptime, proportion_within_sensor_uptime, 'Sensor has readings more frequent than once per hour. Check readings table.']

In [7]:
#This function takes (data, start date, end date, sensor id) and returns a report row: sensor id, a 0-255 status value, % uptime, and tenant info
def sensor_down(data, start_date, end_date, sensor_id): 
    
    #This pulls up the tenant's first and last name.
    try:
        tenant_id = int(sensors.loc[sensors.id == sensor_id].user_id.values[0])
        tenant_first_name = users.loc[users.id == tenant_id].first_name.values[-1] #values[-1] takes the last matching entry
        tenant_last_name = users.loc[users.id == tenant_id].last_name.values[-1]
    #ValueError: user_id is NaN; IndexError: the sensor has no row in the sensors table
    except (ValueError, IndexError):
        tenant_id = 'None'
        tenant_first_name = 'Not'
        tenant_last_name = 'Assigned'
        
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)

    sensor_readings = data.loc[data.sensor_id == sensor_id]
    
    #Converting to timestamps
    #for i in sensor_readings.index.values: #Iterates through all the index values
        #sensor_readings.loc[i, 'created_at'] = pd.Timestamp(sensor_readings.created_at[i])
    #Using map instead of for loop (about 15-20x faster)
    try:
        sensor_readings.loc[:, 'created_at'] = map(pd.Timestamp, sensor_readings.created_at)
    except TypeError:
        tenant_first_name = 'Mapping Error'
        tenant_last_name = 'Only One Entry'
        pass
    #Using list comprehensions (as efficient as map)
    #sensor_readings.loc[:, 'created_at'] = [pd.Timestamp(x) for x in sensor_readings.created_at]
        
    #Using a boolean mask to select readings between the two dates 
    #(http://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates)   
    mask = (sensor_readings['created_at'] > start_date) & (sensor_readings['created_at'] <= end_date)
    masked_sensor_readings = sensor_readings.loc[mask] #Get all readings between the two dates
    masked_sensor_readings = masked_sensor_readings.sort_values('created_at')
    #We then calculate how many hours have passed for that specific sensor and date range
    try:
        sensor_readings_start_date = masked_sensor_readings.loc[masked_sensor_readings.index.values[0], 'created_at']
        sensor_readings_end_date = \
        masked_sensor_readings.loc[masked_sensor_readings.index.values[len(masked_sensor_readings)-1], 'created_at']
        timedelta_in_seconds =  sensor_readings_end_date - sensor_readings_start_date #This returns Timedelta object
        timedelta_in_seconds = timedelta_in_seconds.total_seconds()
        total_number_of_hours = timedelta_in_seconds/3600 + 1 #+1 counts both endpoints: N hourly readings span N-1 hours
        
        hours_in_date_range = ((end_date-start_date).total_seconds())/3600 + 1
        proportion_of_total_uptime = (len(masked_sensor_readings)/hours_in_date_range) * 100 #Proportion of uptime over TOTAL HOURS
        proportion_within_sensor_uptime = (len(masked_sensor_readings)/total_number_of_hours) * 100 #Proportion of uptime for the sensor's first and last uploaded dates.
        
        if proportion_within_sensor_uptime <= 100.1:
            return [sensor_id, 255*(proportion_within_sensor_uptime/100), proportion_of_total_uptime, tenant_first_name, tenant_last_name, tenant_id]
        else:
            pass
    except IndexError:
        pass

In [113]:
#From the time a sensor has been deployed until now: what % of the total hours in which it is possible to receive a violation are actually in violation?
#Minimum of 30 days of data.

#Account for sensor downtime:
#instead of calculating proportions over 100%, we can calculate them over the % of time the sensor was actually up - so basically, if it wasn't up
#we assume that there was no violation.

def violation_percentages(data, start_date, end_date, sensor_id):
    
    sensor_readings = data.loc[data.sensor_id == sensor_id] #All readings for a sensorID
    try:
        sensor_readings.loc[:,'created_at'] = map(pd.Timestamp, sensor_readings.created_at) #convert all to timestamps
    except TypeError:
        pass
    
    #Filter out sensors that are < 30 days old
    try:
        sensor_readings_start_date = sensor_readings.loc[sensor_readings.index.values[0], 'created_at'].date()
        today = date.today()
        datediff = today - sensor_readings_start_date
    except IndexError:
        return "No readings in date range."
    
    if datediff.days < 30: #If a sensor has been up for < 30 days, don't do anything
        pass
    else:
        start_date = pd.Timestamp(start_date) #Convert dates to pd.Timestamp
        end_date = pd.Timestamp(end_date)
    
        mask = (sensor_readings['created_at'] > start_date) & (sensor_readings['created_at'] <= end_date) #mask for date range
        masked_sensor_readings = sensor_readings.loc[mask]

        try:
            #First, find all possible violation-hours
            ##We need to index as datetimeindex in order to use the .between_time method
            sensor_readings.set_index(pd.DatetimeIndex(sensor_readings['created_at']), inplace = True)
        
            ##These return the day and night readings
            day_readings = sensor_readings.between_time(start_time='06:00', end_time='22:00')
            night_readings = sensor_readings.between_time(start_time='22:00', end_time='6:00')

            ##Now, we count how many rows are violations and divide by total possible violation hours
            #For day, if outdoor_temp < 55
            day_total_violable_hours = len(day_readings.loc[day_readings['outdoor_temp'] < 55])
            day_actual_violation_hours = len(day_readings.loc[day_readings['violation'] == True])
            #For night, if outdoor_temp < 40
            night_total_violable_hours = len(night_readings.loc[night_readings['outdoor_temp'] < 40])
            night_actual_violation_hours = len(night_readings.loc[night_readings['violation'] == True])

            #Calculate percentage
            try:
                violation_percentage = float(day_actual_violation_hours + night_actual_violation_hours)/float(day_total_violable_hours + night_total_violable_hours)
            except ZeroDivisionError:
                return "No violations in this range."
                
            return violation_percentage #violationpercentage
        
        except IndexError:
            pass
    
unique_sensors = readings['sensor_id'].unique()

for ids in unique_sensors:
    print "Sensor ID: {0}, Violation Percentage: {1}".format(ids, violation_percentages(readings, '2016-01-01', '2016-02-07', ids))


Sensor ID: nan, Violation Percentage: No readings in date range.
Sensor ID: 19.0, Violation Percentage: 0.232402912621
Sensor ID: 26.0, Violation Percentage: 0.681012658228
Sensor ID: 22.0, Violation Percentage: 0.198473282443
Sensor ID: 29.0, Violation Percentage: 0.0
Sensor ID: 25.0, Violation Percentage: 0.602150537634
Sensor ID: 28.0, Violation Percentage: 0.0
Sensor ID: 24.0, Violation Percentage: 0.238185255198
Sensor ID: 39.0, Violation Percentage: 0.136363636364
Sensor ID: 38.0, Violation Percentage: 0.9
Sensor ID: 37.0, Violation Percentage: 0.436559139785
Sensor ID: 35.0, Violation Percentage: 0.0
Sensor ID: 33.0, Violation Percentage: 0.003663003663
Sensor ID: 32.0, Violation Percentage: 0.0
Sensor ID: 31.0, Violation Percentage: 0.00981767180926
Sensor ID: 34.0, Violation Percentage: 0.0
Sensor ID: 1.0, Violation Percentage: 1.0
Sensor ID: 8.0, Violation Percentage: No violations in this range.
Sensor ID: 43.0, Violation Percentage: 0.0
Sensor ID: 42.0, Violation Percentage: 0.841807909605
Sensor ID: 15.0, Violation Percentage: No violations in this range.
Sensor ID: 44.0, Violation Percentage: 0.14701986755
Sensor ID: 144.0, Violation Percentage: 0.124183006536
Sensor ID: 130.0, Violation Percentage: 0.108843537415
Sensor ID: 98.0, Violation Percentage: 0.211488250653
Sensor ID: 152.0, Violation Percentage: 0.027027027027
Sensor ID: 10.0, Violation Percentage: No violations in this range.
Sensor ID: 114.0, Violation Percentage: 0.771653543307
Sensor ID: 123.0, Violation Percentage: 0.0
Sensor ID: 139.0, Violation Percentage: 0.416553595658
Sensor ID: 166.0, Violation Percentage: 0.117021276596
Sensor ID: 184.0, Violation Percentage: 0.38785046729
Sensor ID: 87.0, Violation Percentage: 0.0
Sensor ID: 102.0, Violation Percentage: 1.0
Sensor ID: 93.0, Violation Percentage: 0.047619047619
Sensor ID: 132.0, Violation Percentage: 0.0
Sensor ID: 121.0, Violation Percentage: 0.222408026756
Sensor ID: 58.0, Violation Percentage: 0.00568181818182
Sensor ID: 179.0, Violation Percentage: 0.260485651214
Sensor ID: 169.0, Violation Percentage: 0.207165109034
Sensor ID: 107.0, Violation Percentage: 0.0987654320988
Sensor ID: 171.0, Violation Percentage: 0.814814814815
Sensor ID: 174.0, Violation Percentage: 0.0054347826087
Sensor ID: 89.0, Violation Percentage: 0.0505050505051
Sensor ID: 176.0, Violation Percentage: 0.0
Sensor ID: 164.0, Violation Percentage: 0.167010309278
Sensor ID: 154.0, Violation Percentage: 0.162162162162
Sensor ID: 133.0, Violation Percentage: 0.0
Sensor ID: 145.0, Violation Percentage: 0.0257913247362
Sensor ID: 167.0, Violation Percentage: 0.0
Sensor ID: 181.0, Violation Percentage: 0.0
Sensor ID: 173.0, Violation Percentage: 0.0868686868687
Sensor ID: 178.0, Violation Percentage: 0.0
Sensor ID: 135.0, Violation Percentage: 0.0714285714286
Sensor ID: 158.0, Violation Percentage: 0.0894428152493
Sensor ID: 170.0, Violation Percentage: 0.483679525223
Sensor ID: 168.0, Violation Percentage: 0.666666666667
Sensor ID: 153.0, Violation Percentage: 0.0
Sensor ID: 159.0, Violation Percentage: 0.0
Sensor ID: 90.0, Violation Percentage: 0.0
Sensor ID: 88.0, Violation Percentage: 0.0376569037657
Sensor ID: 186.0, Violation Percentage: 0.393867924528
Sensor ID: 187.0, Violation Percentage: 0.0
Sensor ID: 180.0, Violation Percentage: 0.266666666667
Sensor ID: 188.0, Violation Percentage: 0.00637958532695
Sensor ID: 161.0, Violation Percentage: 0.0
Sensor ID: 13.0, Violation Percentage: No violations in this range.
Sensor ID: 14.0, Violation Percentage: No violations in this range.
Sensor ID: 189.0, Violation Percentage: 0.473180076628
Sensor ID: 193.0, Violation Percentage: 0.126805778491
Sensor ID: 194.0, Violation Percentage: 0.154696132597
Sensor ID: 195.0, Violation Percentage: 0.0943060498221
Sensor ID: 191.0, Violation Percentage: 0.0546075085324
Sensor ID: 192.0, Violation Percentage: 0.0510948905109
Sensor ID: 296.0, Violation Percentage: 0.538461538462
Sensor ID: 155.0, Violation Percentage: 0.558823529412
Sensor ID: 267.0, Violation Percentage: 0.303370786517
Sensor ID: 276.0, Violation Percentage: 0.595463137996
Sensor ID: 283.0, Violation Percentage: 0.0
Sensor ID: 287.0, Violation Percentage: 0.0
Sensor ID: 172.0, Violation Percentage: 0.241758241758
Sensor ID: 285.0, Violation Percentage: 0.7903930131
Sensor ID: 271.0, Violation Percentage: 0.0
Sensor ID: 292.0, Violation Percentage: 0.002
Sensor ID: 11.0, Violation Percentage: 0.117614269788
Sensor ID: 160.0, Violation Percentage: 0.0
Sensor ID: 274.0, Violation Percentage: 0.124390243902
Sensor ID: 228.0, Violation Percentage: 1.04207119741
Sensor ID: 284.0, Violation Percentage: 0.0
Sensor ID: 268.0, Violation Percentage: 0.0
Sensor ID: 214.0, Violation Percentage: 0.0
Sensor ID: 282.0, Violation Percentage: 0.0
Sensor ID: 27.0, Violation Percentage: 0.0
Sensor ID: 190.0, Violation Percentage: 0.0107991360691
Sensor ID: 85.0, Violation Percentage: 0.977272727273
Sensor ID: 288.0, Violation Percentage: 0.0
Sensor ID: 273.0, Violation Percentage: 0.516129032258
Sensor ID: 21.0, Violation Percentage: 0.0

In [4]:
#This function creates a simulated dataset of readings.
def simulate_data(start_date, end_date, polling_rate, sensor_id): #polling_rate in minutes
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)

    #How many readings fit between the two dates at the given polling rate:
    timedelta_between_dates = end_date - start_date
    total_number_of_readings = timedelta_between_dates.total_seconds()/(polling_rate*60)
    
    #Create an empty pandas dataframe
    index = xrange(1, int(total_number_of_readings)+1)
    columns = ['created_at', 'sensor_id']
    simulated_readings = pd.DataFrame(index = index, columns = columns)
    simulated_readings.loc[:,'sensor_id'] = sensor_id
    
    #Populate the 'created_at' column with timestamps spaced polling_rate minutes apart
    time_counter = start_date
    for i in simulated_readings.index.values:
        simulated_readings.loc[i,'created_at'] = time_counter
        time_counter = time_counter + pd.Timedelta(minutes = polling_rate) #also handles rates of 60+ minutes
   
    return simulated_readings
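
In [ ]:
#A quick sanity check for simulate_data and the sensor_down functions above (sensor id 9999
#is hypothetical and isn't in the sensors table, so the tenant name comes back as 'Not Assigned').
#A sensor polling every 60 minutes for a week should report ~100% uptime; one polling every
#30 minutes should trip the 'more frequent than once per hour' warning.
simulated_hourly = simulate_data('2016-02-01', '2016-02-08', 60, 9999)
simulated_fast = simulate_data('2016-02-01', '2016-02-08', 30, 9999)
print sensor_down_complete(simulated_hourly, '2016-02-01', '2016-02-08', 9999)
print sensor_down_complete(simulated_fast, '2016-02-01', '2016-02-08', 9999)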

In [37]:
#This function generates a report; we might want to make this a cron job.
def generate_report(start_date, end_date):
    report = []
    sensor_ids = readings.sensor_id.unique()
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    for ids in sensor_ids:
        temp = sensor_down(readings, start_date, end_date, ids)
        if temp is not None:
            report.append(temp)
    return report

In [49]:
tic = time.clock()

report = generate_report('2016-02-01','2016-02-07')
header =['sensorID', 'status', 'Percentage of uptime in daterange', 'FirstName', 'LastName' , 'userID']

toc = time.clock()
toc - tic


Out[49]:
3.7389025019316477

In [17]:
tic = time.clock()

report = dirty_data(readings)
header = ['UserID', 'SensorID', 'Outside90', 'Inside40', 'InsideColderOutside']

toc = time.clock()
toc - tic



In [50]:
csvoutput = open('sensors.csv', 'wb')
writer = csv.writer(csvoutput)
writer.writerow(header)
for i in report:
    writer.writerow(i)
csvoutput.close()

In [23]:
report.to_csv('dirtydata.csv', index = False, na_rep="Not Currently Assigned")

In [98]:
#What criteria do we want to use?

#SENSOR NUMBERS AS IDENTIFIERS, SOME DEGREE OF SEVERITY OF PROBLEM (CATEGORICAL)

#For the criteria we go with, how do we define "bad"?
#total days
#proportion of days (or clusters?)
#temperature discrepancy
#multiple apartments in the same building (how many % of the apartments in a building and how bad)
#multiple buildings by the same landlord (how many % of the buildings a landlord owns are bad)
#are our sensors failing in a specific building?

#Getting rid of test cases:
#1. Can we just delete by test IDs?
#2. If testing was separated by a minute, we can find all test cases by looping through all users,
#  and if they have a bunch of data that was collected within minutes, delete the user?

#We first convert the string dates into datetime format
sensordata['formatteddate'] = sensordata.created_at.apply(lambda x: pd.to_datetime(x,  format = "%Y-%m-%d %H:%M"))

#Then, one way of telling if a user_id was an actual user or a test case was to calculate the average timedelta for each user_id.
#Timedeltas of 1min are tests, 1 hour are users (don't know if this is always true, but if no user has an average polling rate
#of 1 min, we can use a bunch of methods to filter away test cases step by step).

sensordata['averagetimedelta'] = 0.00 #makes a new column

for i in sensordata.user_id.unique(): #for each user
    timelist = sensordata.loc[sensordata.user_id == i, 'formatteddate'] #this gives us a list of all their times in timestamp
    timedeltas = [] 
    for j in range(1, len(timelist)):
        timedeltas.append(timelist.iloc[j] - timelist.iloc[j-1]) #list of differences in time between time point j and j-1
    try:
        #we print the user_id, followed by their average time delta from point j to j-1
        average_timedelta = abs(sum(timedeltas, datetime.timedelta(0)))/len(timedeltas) #the average timedelta
        print i, average_timedelta
        sensordata.loc[sensordata.user_id == i, 'averagetimedelta'] = average_timedelta.total_seconds()
    except ZeroDivisionError: #some cases have too few points (which results in a zero division error)
        #instead of breaking when we encounter a ZeroDivisionError, just print the following:
        print i, "Too few data points?"

#3. If user ids are recycled, we'll have to do a combination of those things.

#Some sensors


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-98-0bc7b148d9be> in <module>()
     17 
     18 #We first convert the string dates into datetime format
---> 19 sensordata['formatteddate'] = sensordata.created_at.apply(lambda x: pd.to_datetime(x,  format = "%Y-%m-%d %H:%M"))
     20 
     21 #Then, one way of telling if a user_id was an actual user or a test case was to calculate the average timedelta for each user_id.

NameError: name 'sensordata' is not defined

In [322]:
#Triage
#We want to have a measure of which users are facing the most chronic problems.

#Metric combining temperature difference and chronicity of problems

#Write some code that subsets all the violation == 't' cases
sensordataviolations = sensordata[sensordata.violation == 't'] #here it is.

#Hackiest method: just number of violations / number of non-violations, and sort users by that
#That is, which users have had the most violations given the total number of readings 

violationsovertime = []

for i in sensordata.user_id.unique():
    nonviolations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['f'] #Number of non-violations (violation == 'f')
    try:
        violations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['t'] #Number of violations (violation == 't')
    except KeyError:
        violations = 0    
    sensordata.loc[sensordata.user_id == i, 'vfreq'] = float(violations)/float(nonviolations)
    violationsovertime.append([i, (float(violations)/float(nonviolations))])

#violationsovertime gives the user_id first, then the ratio of that user's violation readings to their non-violation readings

In [ ]:
#Stuff to do for fun

#Variability/consistency
#Which buildings have the least/most variable temperatures?
#For this, we just calculate within-person variability (how much do the temperatures reported by the same user's sensor vary over time)
#We can use the same process to calculate variability between locations (e.g., just calculate the variance for each location) - quick sketch below
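
#A minimal sketch of the variability idea, assuming the 'readings' and 'sensors' dataframes from
#the cells above ('temp' is the indoor temperature column).
temps = readings['temp'].astype(float) #make sure temps are numeric
within_sensor_variability = temps.groupby(readings['sensor_id']).std().sort_values()
#Roll the same numbers up by user by mapping sensor_id -> user_id through the sensors table
sensor_to_user = sensors.set_index('id')['user_id']
within_user_variability = temps.groupby(readings['sensor_id'].map(sensor_to_user)).std().sort_values()
print within_sensor_variability
print within_user_variability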

In [ ]:
#Now, we loop over all unique users in the dataset and generate a measure of how long they've had the sensor running
sensordata['totaltime'] = 0
sensordata['vfreq'] = 0

for i in sensordata.user_id.unique():
    numentries = len(sensordata.loc[sensordata.user_id==i,'formatteddate']) #Number of readings for this user
    lasttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[0] #Timestamp of the latest timepoint (first row)
    firsttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[numentries-1] #Timestamp of the first timepoint (last row)
    sensordata.loc[sensordata.user_id == i, 'totaltime'] = lasttime - firsttime #Timedelta: over how long a period readings were made
    #print i, lasttime-firsttime

In [112]:
#sensors.user_id.value_counts()
#sensordata.violation.value_counts() #This returns the number of 't's and 'f's
#np.sort(userdata.id.unique())
#np.intersect1d(userdata.id.unique(), sensordata.user_id.unique()) #Returns the common ids in both the datasets.
#readings.sensor_id.unique