In [247]:
import datetime
import pandas as pd
import numpy as np
import psycopg2
#sensordata = pd.read_csv(sensorcsv)
#userdata = pd.read_csv(usercsv)
In [286]:
try:
    connection = psycopg2.connect(database='heatseek', user='heatseek', password='wearecoolsoweseekheat')
    cursor = connection.cursor() #Open a cursor to perform operations

    cursor.execute('SELECT * FROM users LIMIT 100;') #Executes the query
    users = cursor.fetchall() #cursor.fetchone() for one row, fetchmany() for several rows, fetchall() for all rows
    users = pd.DataFrame(users) #Saves 'users' as a pandas DataFrame
    users_header = [desc[0] for desc in cursor.description] #Column names are in the 0th index of each cursor.description entry
    users.columns = users_header #Set the DataFrame's column names

    cursor.execute('SELECT * FROM readings LIMIT 1000;')
    readings = cursor.fetchall()
    readings = pd.DataFrame(readings)
    readings_header = [desc[0] for desc in cursor.description]
    readings.columns = readings_header

    cursor.execute('SELECT * FROM sensors LIMIT 100;')
    sensors = cursor.fetchall()
    sensors = pd.DataFrame(sensors)
    sensors_header = [desc[0] for desc in cursor.description]
    sensors.columns = sensors_header

    cursor.close()
    connection.close()
except psycopg2.DatabaseError as error:
    print 'Error %s' % error
In [244]:
#sensordata.user_id.unique()
#sensordata.user_id.value_counts()
#sensordata.violation.value_counts() #This returns the number of 't's and 'f's
#np.sort(userdata.id.unique())
#np.intersect1d(userdata.id.unique(), sensordata.user_id.unique()) #Returns the common ids in both the datasets.
In [289]:
#THIS HAS TO BE RUN ON CLEAN DATA FROM WHICH READINGS TAKEN FASTER THAN ONCE PER HOUR HAVE BEEN REMOVED
#(one possible way to build such a frame is sketched in the cell after this one).
#Function that takes (data, start date, end date, sensor id) and returns the proportion of uptime,
#i.e. the number of readings received per expected hourly reading in that window.
def sensor_down(data, start_date, end_date, sensor_id=0):
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    sensor_readings = data.loc[data.sensor_id == sensor_id] #Use the passed-in frame, not the global 'readings'
    #Converting to timestamps
    for i in sensor_readings.index.values: #Iterates through all the index values
        sensor_readings.loc[i, 'create_at'] = pd.Timestamp(sensor_readings.create_at[i])
    #Using a boolean mask to select readings between the two dates
    #(http://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates)
    mask = (sensor_readings['create_at'] > start_date) & (sensor_readings['create_at'] <= end_date)
    sensor_readings = sensor_readings.loc[mask] #Get all readings between the two dates
    #We then calculate how many hours have passed for that specific sensor and date range
    sensor_readings_start_date = sensor_readings.loc[sensor_readings.index.values[0], 'create_at']
    sensor_readings_end_date = sensor_readings.loc[sensor_readings.index.values[-1], 'create_at']
    timedelta_between_readings = sensor_readings_end_date - sensor_readings_start_date #This returns a Timedelta object
    timedelta_in_seconds = timedelta_between_readings.total_seconds()
    total_number_of_hours = timedelta_in_seconds/3600
    #With hourly polling we expect one reading per hour, so readings/hours approximates the proportion of uptime
    proportion_of_uptime = len(sensor_readings)/total_number_of_hours
    return proportion_of_uptime

print sensor_down(readings, '2015-03-11', '2015-03-21', 26)
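In [ ]:
#Sketch (not in the original notebook): one possible way to build the "clean" readings frame that
#sensor_down() assumes, by dropping rows that arrive less than an hour after the previous reading
#from the same sensor. It reuses the 'sensor_id' and 'create_at' columns referenced above; treat it
#as a starting point rather than the project's actual cleaning step.
clean_readings = readings.copy()
clean_readings['create_at'] = pd.to_datetime(clean_readings['create_at'])
clean_readings = clean_readings.sort_values(['sensor_id', 'create_at'])
gap = clean_readings.groupby('sensor_id')['create_at'].diff() #time since the previous reading from the same sensor
clean_readings = clean_readings[gap.isnull() | (gap >= pd.Timedelta(hours=1))] #keep first readings and hourly-or-slower ones
print sensor_down(clean_readings, '2015-03-11', '2015-03-21', 26)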
In [ ]:
#What criteria do we want to use?
#SENSOR NUMBERS AS IDENTIFIERS, SOME DEGREE OF SEVERITY OF PROBLEM (CATEGORICAL)
#For the criteria we go with, how do we define "bad"?
#total days
#proportion of days (or clusters?)
#temperature discrepancy
#multiple apartments in the same building (how many % of the apartments in a building and how bad)
#multiple buildings by the same landlord (how many % of the buildings a landlord owns are bad)
#are our sensors failing in a specific building?
#Getting rid of test cases:
#1. Can we just delete by test IDs? (a sketch of this is added after this cell)
#2. If testing was separated by a minute, we can find all test cases by looping through all users,
# and if they have a bunch of data that was collected within minutes, delete the user?
#We first convert the string dates into datetime format
sensordata['formatteddate'] = sensordata.created_at.apply(lambda x: pd.to_datetime(x, format="%Y-%m-%d %H:%M"))
#Then, one way of telling whether a user_id was an actual user or a test case is to calculate the average timedelta for each user_id.
#Timedeltas of ~1 minute are tests, ~1 hour are users (we don't know if this is always true, but if no real user has an average
#polling rate of 1 minute, we can use a bunch of methods to filter away test cases step by step).
sensordata['averagetimedelta'] = 0.00 #makes a new column
for i in sensordata.user_id.unique(): #for each user
    timelist = sensordata.loc[sensordata.user_id == i, 'formatteddate'] #all of their reading times as Timestamps
    timedeltas = []
    for j in range(1, len(timelist)):
        timedeltas.append(timelist.iloc[j] - timelist.iloc[j-1]) #list of differences in time between time point j and j-1
    try:
        averagetimedelta = abs(sum(timedeltas, datetime.timedelta(0)))/len(timedeltas) #the average timedelta
        #we print the user_id, followed by their average time delta
        print i, averagetimedelta
        sensordata.loc[sensordata.user_id == i, 'averagetimedelta'] = averagetimedelta.total_seconds()
    except ZeroDivisionError: #some users have too few points (which results in a zero division error)
        #instead of breaking when we encounter a ZeroDivisionError, just print the following:
        print i, "Too few data points?"
#3. If user ids are recycled, we'll have to do a combination of those things.
#Some sensors
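In [ ]:
#Sketch for strategy #1 above (not in the original notebook): if we already have a list of known
#test-account user_ids, we can simply drop their rows. 'test_user_ids' is a hypothetical placeholder,
#not an actual list of Heat Seek test accounts.
test_user_ids = [] #fill in with known test user_ids
sensordata = sensordata[~sensordata.user_id.isin(test_user_ids)]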
In [322]:
#Triage
#We want to have a measure of which users are facing the most chronic problems.
#Metric combining temperature difference and chronicity of problems
#Write some code that subsets all the violation == 't' cases
sensordataviolations = sensordata[sensordata.violation == 't'] #here it is
#Hackiest method: take number of violations / number of non-violations for each user and sort users by that
#That is, which users have had the most violations relative to their total number of readings
#(a sketch that sorts this list follows this cell)
violationsovertime = []
for i in sensordata.user_id.unique():
    nonviolations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['f'] #Number of non-violations ('f')
    try:
        violations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['t'] #Number of violations ('t')
    except KeyError: #users with no violations have no 't' entry in value_counts()
        violations = 0
    sensordata.loc[sensordata.user_id == i, 'vfreq'] = float(violations)/float(nonviolations)
    violationsovertime.append([i, (float(violations)/float(nonviolations))])
#violationsovertime gives, for each user, the user_id followed by the ratio of violation readings to non-violation readings
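In [ ]:
#Sketch (not in the original notebook): sorting the violationsovertime list so the users with the
#highest violation ratios come first, which is the "sort users by that" step described above.
violationsovertime.sort(key=lambda pair: pair[1], reverse=True)
print violationsovertime[:10] #the ten users with the highest violation ratios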
In [ ]:
#Stuff to do for fun
#Variability/consistency
#Which buildings have the least/most variable temperatures?
#For this, we just calculate within-person variability (how much sensor temperatures from the same user vary over time)
#We can use this same process to calculate variability between locations (e.g., just calculate the variance for each location)
#(a per-user version is sketched in the next cell)
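In [ ]:
#Sketch (not in the original notebook): one way to get at within-person variability is the standard
#deviation of each user's temperature readings. This assumes sensordata has a numeric 'temp' column;
#the actual column name in the CSV may differ.
temperature_variability = sensordata.groupby('user_id')['temp'].std()
print temperature_variability.sort_values() #users ordered from most consistent to most variable temperatures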
In [ ]:
#Now, we loop over all unique users in the dataset and generate a measure of how long they've had the sensor running
sensordata['totaltime'] = 0
sensordata['vfreq'] = 0
for i in sensordata.user_id.unique():
    numreadings = len(sensordata.loc[sensordata.user_id == i, 'formatteddate']) #Number of readings for this user
    #Readings appear to be ordered newest-first, so iloc[0] is the latest reading and iloc[numreadings-1] is the earliest
    lasttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[0] #Timestamp of the latest timepoint
    firsttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[numreadings-1] #Timestamp of the first timepoint
    sensordata.loc[sensordata.user_id == i, 'totaltime'] = lasttime - firsttime #Timedelta covering the period over which readings were made
    #print i, lasttime - firsttime