In [247]:
import datetime
import pandas as pd
import numpy as np
import psycopg2
#sensordata = pd.read_csv(sensorcsv)
#userdata = pd.read_csv(usercsv)
In [286]:
try:
    connection = psycopg2.connect(database='heatseek', user='heatseek', password='wearecoolsoweseekheat')
    cursor = connection.cursor() #Open a cursor to perform operations

    cursor.execute('SELECT * FROM users LIMIT 100;') #Executes the query
    users = cursor.fetchall() #cursor.fetchone() for one row, fetchmany() for several rows, fetchall() for all rows
    users = pd.DataFrame(users) #Saves 'users' as a pandas DataFrame
    users_header = [desc[0] for desc in cursor.description] #Column names are in the 0th index of each cursor.description entry
    users.columns = users_header #Set the DataFrame's column names

    cursor.execute('SELECT * FROM readings LIMIT 1000;')
    readings = cursor.fetchall()
    readings = pd.DataFrame(readings)
    readings_header = [desc[0] for desc in cursor.description]
    readings.columns = readings_header

    cursor.execute('SELECT * FROM sensors LIMIT 100;')
    sensors = cursor.fetchall()
    sensors = pd.DataFrame(sensors)
    sensors_header = [desc[0] for desc in cursor.description]
    sensors.columns = sensors_header

    cursor.close()
    connection.close()
except psycopg2.DatabaseError as error:
    print 'Error %s' % error
In [244]:
#sensordata.user_id.unique()
#sensordata.user_id.value_counts()
#sensordata.violation.value_counts() #This returns the number of 't's and 'f's
#np.sort(userdata.id.unique())
#np.intersect1d(userdata.id.unique(), sensordata.user_id.unique()) #Returns the common ids in both the datasets.
In [289]:
#THIS HAS TO BE RUN ON CLEAN DATA FROM WHICH READINGS TAKEN FASTER THAN ONCE PER HOUR HAVE BEEN REMOVED
#(one possible way to build such a frame is sketched in the cell after this one).
#Function that takes (data, start date, end date, sensor id) and returns the proportion of uptime,
#i.e. the number of readings received per expected hourly reading in that window.
def sensor_down(data, start_date, end_date, sensor_id=0):
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    sensor_readings = data.loc[data.sensor_id == sensor_id] #Use the passed-in frame, not the global 'readings'
    #Converting to timestamps
    for i in sensor_readings.index.values: #Iterates through all the index values
        sensor_readings.loc[i, 'create_at'] = pd.Timestamp(sensor_readings.create_at[i])
    #Using a boolean mask to select readings between the two dates
    #(http://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates)
    mask = (sensor_readings['create_at'] > start_date) & (sensor_readings['create_at'] <= end_date)
    sensor_readings = sensor_readings.loc[mask] #Get all readings between the two dates
    #We then calculate how many hours have passed for that specific sensor and date range
    sensor_readings_start_date = sensor_readings.loc[sensor_readings.index.values[0], 'create_at']
    sensor_readings_end_date = sensor_readings.loc[sensor_readings.index.values[-1], 'create_at']
    timedelta_between_readings = sensor_readings_end_date - sensor_readings_start_date #This returns a Timedelta object
    timedelta_in_seconds = timedelta_between_readings.total_seconds()
    total_number_of_hours = timedelta_in_seconds/3600
    #With hourly polling we expect one reading per hour, so readings/hours approximates the proportion of uptime
    proportion_of_uptime = len(sensor_readings)/total_number_of_hours
    return proportion_of_uptime

print sensor_down(readings, '2015-03-11', '2015-03-21', 26)
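In [ ]:
#Sketch (not in the original notebook): one possible way to build the "clean" readings frame that
#sensor_down() assumes, by dropping rows that arrive less than an hour after the previous reading
#from the same sensor. It reuses the 'sensor_id' and 'create_at' columns referenced above; treat it
#as a starting point rather than the project's actual cleaning step.
clean_readings = readings.copy()
clean_readings['create_at'] = pd.to_datetime(clean_readings['create_at'])
clean_readings = clean_readings.sort_values(['sensor_id', 'create_at'])
gap = clean_readings.groupby('sensor_id')['create_at'].diff() #time since the previous reading from the same sensor
clean_readings = clean_readings[gap.isnull() | (gap >= pd.Timedelta(hours=1))] #keep first readings and hourly-or-slower ones
print sensor_down(clean_readings, '2015-03-11', '2015-03-21', 26)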
In [ ]:
#What criteria do we want to use?
#SENSOR NUMBERS AS IDENTIFIERS, SOME DEGREE OF SEVERITY OF PROBLEM (CATEGORICAL)
#For the criteria we go with, how do we define "bad"?
#total days
#proportion of days (or clusters?)
#temperature discrepancy
#multiple apartments in the same building (how many % of the apartments in a building and how bad)
#multiple buildings by the same landlord (how many % of the buildings a landlord owns are bad)
#are our sensors failing in a specific building?
#Getting rid of test cases:
#1. Can we just delete by test IDs? (a sketch of this is added after this cell)
#2. If testing was separated by a minute, we can find all test cases by looping through all users,
# and if they have a bunch of data that was collected within minutes, delete the user?
#We first convert the string dates into datetime format
sensordata['formatteddate'] = sensordata.created_at.apply(lambda x: pd.to_datetime(x, format="%Y-%m-%d %H:%M"))
#Then, one way of telling whether a user_id was an actual user or a test case is to calculate the average timedelta for each user_id.
#Timedeltas of ~1 minute are tests, ~1 hour are users (we don't know if this is always true, but if no real user has an average
#polling rate of 1 minute, we can use a bunch of methods to filter away test cases step by step).
sensordata['averagetimedelta'] = 0.00 #makes a new column
for i in sensordata.user_id.unique(): #for each user
    timelist = sensordata.loc[sensordata.user_id == i, 'formatteddate'] #all of their reading times as Timestamps
    timedeltas = []
    for j in range(1, len(timelist)):
        timedeltas.append(timelist.iloc[j] - timelist.iloc[j-1]) #list of differences in time between time point j and j-1
    try:
        averagetimedelta = abs(sum(timedeltas, datetime.timedelta(0)))/len(timedeltas) #the average timedelta
        #we print the user_id, followed by their average time delta
        print i, averagetimedelta
        sensordata.loc[sensordata.user_id == i, 'averagetimedelta'] = averagetimedelta.total_seconds()
    except ZeroDivisionError: #some users have too few points (which results in a zero division error)
        #instead of breaking when we encounter a ZeroDivisionError, just print the following:
        print i, "Too few data points?"
#3. If user ids are recycled, we'll have to do a combination of those things.
#Some sensors
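In [ ]:
#Sketch for strategy #1 above (not in the original notebook): if we already have a list of known
#test-account user_ids, we can simply drop their rows. 'test_user_ids' is a hypothetical placeholder,
#not an actual list of Heat Seek test accounts.
test_user_ids = [] #fill in with known test user_ids
sensordata = sensordata[~sensordata.user_id.isin(test_user_ids)]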
In [322]:
#Triage
#We want to have a measure of which users are facing the most chronic problems.
#Metric combining temperature difference and chronicity of problems
#Write some code that subsets all the violation == 't' cases
sensordataviolations = sensordata[sensordata.violation == 't'] #here it is
#Hackiest method: take number of violations / number of non-violations for each user and sort users by that
#That is, which users have had the most violations relative to their total number of readings
#(a sketch that sorts this list follows this cell)
violationsovertime = []
for i in sensordata.user_id.unique():
    nonviolations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['f'] #Number of non-violations ('f')
    try:
        violations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['t'] #Number of violations ('t')
    except KeyError: #users with no violations have no 't' entry in value_counts()
        violations = 0
    sensordata.loc[sensordata.user_id == i, 'vfreq'] = float(violations)/float(nonviolations)
    violationsovertime.append([i, (float(violations)/float(nonviolations))])
#violationsovertime gives, for each user, the user_id followed by the ratio of violation readings to non-violation readings
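In [ ]:
#Sketch (not in the original notebook): sorting the violationsovertime list so the users with the
#highest violation ratios come first, which is the "sort users by that" step described above.
violationsovertime.sort(key=lambda pair: pair[1], reverse=True)
print violationsovertime[:10] #the ten users with the highest violation ratios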
In [ ]:
#Stuff to do for fun
#Variability/consistency
#Which buildings have the least/most variable temperatures?
#For this, we just calculate within-person variability (how much sensor temperatures from the same user vary over time)
#We can use this same process to calculate variability between locations (e.g., just calculate the variance for each location)
#(a per-user version is sketched in the next cell)
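In [ ]:
#Sketch (not in the original notebook): one way to get at within-person variability is the standard
#deviation of each user's temperature readings. This assumes sensordata has a numeric 'temp' column;
#the actual column name in the CSV may differ.
temperature_variability = sensordata.groupby('user_id')['temp'].std()
print temperature_variability.sort_values() #users ordered from most consistent to most variable temperatures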
In [ ]:
#Now, we loop over all unique users in the dataset and generate a measure of how long they've had the sensor running
sensordata['totaltime'] = 0
sensordata['vfreq'] = 0
for i in sensordata.user_id.unique():
    numreadings = len(sensordata.loc[sensordata.user_id == i, 'formatteddate']) #Number of readings for this user
    #Readings appear to be ordered newest-first, so iloc[0] is the latest reading and iloc[numreadings-1] is the earliest
    lasttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[0] #Timestamp of the latest timepoint
    firsttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[numreadings-1] #Timestamp of the first timepoint
    sensordata.loc[sensordata.user_id == i, 'totaltime'] = lasttime - firsttime #Timedelta covering the period over which readings were made
    #print i, lasttime - firsttime