In [22]:
#0) What: Test steps 1,2,3 in local Postgres instance
#Who: Bryan/Jesse
#1) What: Create a second table (or tables) within Postgres. I suggest just adding "_cleaned" to the name and keeping the same prefix (a minimal sketch follows these notes).
#The schema will remain the same. William - any objections here? We don't have a dev/staging Postgres instance, do we?
#Who: Jesse
#2a) What: Take Bryan's new Python and test each 'rule,' adding logic for writing to Postgres/the new 'cleaned' table.
#Who: Bryan/Jesse
#2b) What: Add logic for alerts (SendGrid); fire an alert when an issue is found or a correction is made (see the alert sketch after the cleaning cell below).
#Who: Jesse
#3) What: Verify that results look good in *_cleaned table(s).
#Who: Bryan/Jesse/William(?)/anyone else who wants to help test
#4) What: Set up a cron job to run, initially, every 4-6 hours (a sample crontab entry follows the report-generation cell below). Review results and make sure that:
#Data is not thrown out or unnecessarily altered; integrity is retained
#Any data corrections are done properly
#Review / tweak anything else (see Bryan's notes in Asana: "Data Cleaning Notes" under "Data Science Brainstorming")
#Who: Bryan/Jesse
#5) What: After 5-7 days of testing, 'deploy.' Cron job does not need to run as often; decide on interval.
#Who: Jesse
#6) What: Point all relevant queries to *_cleaned table(s)
#Who: Bryan/Jesse/Emily
#Will it tell the difference between an account that is consistently down and one that is inconsistently down?
#After missing a reading, does a sensor ever get back to 100% (visualize this)
#csv of sensor ids, date, and percentage (goes to a website?)
#can it pull the person's first and last names
#first name last name (sensor id), dates, percentages below
#Fix tenant names
#Code to visualize stuff - for time periods of months and years
#Emily source code visualization plot %
#Talk to William or Jesse about automating this and letting Noelle access it
#clean data function (remove bad data)
#stats and viz stuff - 311 data, hpd data (blog posts)
#is hpd monitoring itself at resolving heat complaints?
#hpd conversion rate low, who and why? (conversion rate = complaint to investigation) (summer blog posts)
#how many heat complaints to date? (visualization)
#A tale of two cities - adjust for pop (complaints per capita)
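In [ ]:
#A minimal sketch of steps 1/2a above: mirror a table into a *_cleaned copy with an identical schema,
#then write cleaned rows into it. The table name readings_cleaned and the reuse of the local
#credentials from the connection cell below are assumptions, not decisions that have been made yet.
import psycopg2
connection = psycopg2.connect(database='heatseek', user='heatseekroot', password='wearecoolsoweseekheat')
cursor = connection.cursor()
#LIKE ... INCLUDING ALL copies columns, defaults, constraints, and indexes, so the schema stays the same
cursor.execute('CREATE TABLE IF NOT EXISTS readings_cleaned (LIKE readings INCLUDING ALL);')
#Step 2a would then write each cleaned reading back, e.g. (placeholder cleaning rule shown here):
#cursor.execute('INSERT INTO readings_cleaned SELECT * FROM readings WHERE sensor_id IS NOT NULL;')
connection.commit()
cursor.close()
connection.close()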
In [1]:
import datetime
import pandas as pd
import numpy as np
import psycopg2
import csv
import time
from datetime import date
In [37]:
try:
connection = psycopg2.connect(database ='heatseek', user = 'heatseekroot', password = 'wearecoolsoweseekheat')
cursor = connection.cursor() #Open a cursor to perform operations
cursor.execute('SELECT * from users') #Executes the query
users = cursor.fetchall() #cursor.fetchone() for one line, fetchmany() for multiple lines, fetchall() for all lines
users = pd.DataFrame(users) #Saves 'users' as a pandas dataframe
users_header = [desc[0] for desc in cursor.description] #This gets the descriptions from cursor.description
#(names are in the 0th index)
users.columns = users_header #PD array's column names
cursor.execute('SELECT * FROM readings;')
readings = cursor.fetchall()
readings = pd.DataFrame(readings)
readings_header = [desc[0] for desc in cursor.description]
readings.columns = readings_header
cursor.execute('SELECT * FROM sensors;')
sensors = cursor.fetchall()
sensors = pd.DataFrame(sensors)
sensors_header = [desc[0] for desc in cursor.description]
sensors.columns = sensors_header
cursor.close()
connection.close()
except psycopg2.DatabaseError as error:
print 'Error %s' % error
In [159]:
#This builds a list 'sensors_with_users' of user ids that currently have a sensor assigned (ids present in both tables).
sensors_with_users_raw = np.intersect1d(users.id.unique(), sensors.user_id.unique()) #Returns the common ids in both the datasets.
#sensors.loc[sensors.user_id, sensors_with_users]
sensors_with_users = []
for ids in sensors_with_users_raw:
sensors_with_users.append(int(ids))
In [3]:
#This cell defines dirty_data (flags suspect readings and builds a report) and clean_data (returns readings with bad rows removed).
#TODO: also flag sensors that are polling faster than once per hour (i.e., test cases); see sensor_down_complete below.
def dirty_data(dirty_readings, start_date = None, end_date = None):
if start_date is None or end_date is None:
start_date = pd.Timestamp('2000-01-01')
end_date = pd.Timestamp(datetime.datetime.now())
else:
start_date = pd.Timestamp(start_date)
end_date = pd.Timestamp(end_date)
mask = (dirty_readings['created_at'] > start_date) & (dirty_readings['created_at'] <= end_date)
dirty_readings = dirty_readings.loc[mask]
hot_ids = dirty_readings.loc[dirty_readings.temp > 90].sensor_id.unique() #Returns sensor IDs where indoor temp is > 90
cold_ids = dirty_readings.loc[dirty_readings.temp < 40].sensor_id.unique() #Returns sensor IDs where indoor temp is < 40
inside_colder_ids = dirty_readings.loc[dirty_readings.temp < dirty_readings.outdoor_temp].sensor_id.unique() #Returns sensor IDs where indoor temp is < outdoor temp
#Array of all the IDs above
all_ids = np.unique(np.concatenate((hot_ids, cold_ids, inside_colder_ids)))
all_ids = all_ids[~np.isnan(all_ids)]
#Create an empty dataframe with the IDs as indices
report = pd.DataFrame(index=all_ids,columns=['UserID','SensorID', 'Outside90', 'Inside40', 'InsideColderOutside'])
#Fill in the specific conditions as '1'
report.Outside90 = report.loc[hot_ids].Outside90.fillna(1)
report.Inside40 = report.loc[cold_ids].Inside40.fillna(1)
report.InsideColderOutside = report.loc[inside_colder_ids].InsideColderOutside.fillna(1)
report = report.fillna(0)
report.SensorID = report.index
#Fill in UserIDs
problem_ids = sensors[sensors.id.isin(all_ids)]
for index in report.index.values:
index = int(index)
try:
report.loc[index, 'UserID'] = sensors.loc[sensors.id == index, 'user_id'].values[0]
except (KeyError, IndexError):
report.loc[index, 'UserID'] = 'No such sensor in sensors table.'
return report
def clean_data(dirty_readings):
cleaner_readings = dirty_readings[dirty_readings.sensor_id.notnull()] #Remove cases where there are no sensor IDs
return cleaner_readings
clean_readings = clean_data(readings)
report = dirty_data(readings)
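In [ ]:
#Step 2b sketch (alerts): fire an email when dirty_data flags anything. This goes through SendGrid's
#SMTP relay using only the standard library rather than the sendgrid client; the API key, sender,
#and recipient below are placeholders, not values from this project.
import smtplib
from email.mime.text import MIMEText
SENDGRID_API_KEY = 'YOUR_SENDGRID_API_KEY' #placeholder
if len(report) > 0: #only alert when the cleaning report flagged at least one sensor
    msg = MIMEText('Data cleaning flagged %d sensor(s):\n%s' % (len(report), report.to_string()))
    msg['Subject'] = 'Heat Seek data cleaning alert'
    msg['From'] = 'alerts@example.org'
    msg['To'] = 'team@example.org'
    server = smtplib.SMTP('smtp.sendgrid.net', 587)
    server.starttls()
    server.login('apikey', SENDGRID_API_KEY) #SendGrid's SMTP username is literally 'apikey'
    server.sendmail('alerts@example.org', ['team@example.org'], msg.as_string())
    server.quit()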
In [8]:
#This function takes (data, start date, end date, sensor id) and returns the sensor's uptime percentages for that range.
def sensor_down_complete(data, start_date, end_date, sensor_id):
#This pulls up the tenant's first and last name.
try:
tenant_id = int(sensors.loc[sensors.id == sensor_id].user_id.values[0])
tenant_first_name = users.loc[users.id == tenant_id].first_name.values[-1] #Takes the last matching entry for this user id
tenant_last_name = users.loc[users.id == tenant_id].last_name.values[-1]
#Are these really not assigned?
except ValueError:
tenant_id = 'None'
tenant_first_name = 'Not'
tenant_last_name = 'Assigned'
except IndexError:
tenant_id = 'None'
tenant_first_name = 'Not'
tenant_last_name = 'Assigned'
start_date = pd.Timestamp(start_date)
end_date = pd.Timestamp(end_date)
sensor_readings = data.loc[data.sensor_id == sensor_id]
#Converting to timestamps
#for i in sensor_readings.index.values: #Iterates through all the index values
#sensor_readings.loc[i, 'created_at'] = pd.Timestamp(sensor_readings.created_at[i])
#Using map instead of for loop (about 15-20x faster)
try:
sensor_readings.loc[:, 'created_at'] = map(pd.Timestamp, sensor_readings.created_at)
except TypeError:
tenant_first_name = 'Mapping Error'
tenant_last_name = 'Only One Entry'
pass
#Using list comprehensions (as efficient as map)
#sensor_readings.loc[:, 'created_at'] = [pd.Timestamp(x) for x in sensor_readings.created_at]
#Using a boolean mask to select readings between the two dates
#(http://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates)
mask = (sensor_readings['created_at'] > start_date) & (sensor_readings['created_at'] <= end_date)
masked_sensor_readings = sensor_readings.loc[mask] #Get all readings between the two dates
masked_sensor_readings = masked_sensor_readings.sort_values('created_at')
#We then calculate how many hours have passed for that specific sensor and date range
try:
sensor_readings_start_date = masked_sensor_readings.loc[masked_sensor_readings.index.values[0], 'created_at']
sensor_readings_end_date = \
masked_sensor_readings.loc[masked_sensor_readings.index.values[len(masked_sensor_readings)-1], 'created_at']
timedelta_in_seconds = sensor_readings_end_date - sensor_readings_start_date #This returns Timedelta object
timedelta_in_seconds = timedelta_in_seconds.total_seconds()
total_number_of_hours = timedelta_in_seconds/3600 + 1 #The +1 is a fencepost correction: a span of N hours contains N+1 hourly readings.
hours_in_date_range = ((end_date-start_date).total_seconds())/3600 + 1
except IndexError:
return [tenant_first_name, tenant_last_name, sensor_id, tenant_id, "No valid readings during this time frame."]
proportion_of_total_uptime = (len(masked_sensor_readings)/hours_in_date_range) * 100 #Proportion of uptime over TOTAL HOURS
proportion_within_sensor_uptime = (len(masked_sensor_readings)/total_number_of_hours) * 100 #Proportion of uptime for the sensor's first and last uploaded dates.
if proportion_within_sensor_uptime <= 100.1:
return [tenant_first_name, tenant_last_name, sensor_id, tenant_id, proportion_of_total_uptime, proportion_within_sensor_uptime]
else:
return [tenant_first_name, tenant_last_name, sensor_id, tenant_id, proportion_of_total_uptime, proportion_within_sensor_uptime, 'Sensor has readings more frequent than once per hour. Check readings table.']
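In [ ]:
#Quick usage check for sensor_down_complete over a one-week window. The sensor id 42 is a
#placeholder, not a known Heat Seek sensor; any id from readings.sensor_id.unique() works here.
sensor_down_complete(readings, '2016-02-01', '2016-02-07', 42)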
In [7]:
#This function takes (data, start date, end date, sensor id) and returns a report row: [sensor id, status (uptime scaled to 0-255), % uptime in the date range, first name, last name, user id].
def sensor_down(data, start_date, end_date, sensor_id):
#This pulls up the tenant's first and last name.
try:
tenant_id = int(sensors.loc[sensors.id == sensor_id].user_id.values[0])
tenant_first_name = users.loc[users.id == tenant_id].first_name.values[-1] #Takes the last matching entry for this user id
tenant_last_name = users.loc[users.id == tenant_id].last_name.values[-1]
#Are these really not assigned?
except ValueError:
tenant_id = 'None'
tenant_first_name = 'Not'
tenant_last_name = 'Assigned'
except IndexError:
tenant_id = 'None'
tenant_first_name = 'Not'
tenant_last_name = 'Assigned'
start_date = pd.Timestamp(start_date)
end_date = pd.Timestamp(end_date)
sensor_readings = data.loc[data.sensor_id == sensor_id]
#Converting to timestamps
#for i in sensor_readings.index.values: #Iterates through all the index values
#sensor_readings.loc[i, 'created_at'] = pd.Timestamp(sensor_readings.created_at[i])
#Using map instead of for loop (about 15-20x faster)
try:
sensor_readings.loc[:, 'created_at'] = map(pd.Timestamp, sensor_readings.created_at)
except TypeError:
tenant_first_name = 'Mapping Error'
tenant_last_name = 'Only One Entry'
pass
#Using list comprehensions (as efficient as map)
#sensor_readings.loc[:, 'created_at'] = [pd.Timestamp(x) for x in sensor_readings.created_at]
#Using a boolean mask to select readings between the two dates
#(http://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates)
mask = (sensor_readings['created_at'] > start_date) & (sensor_readings['created_at'] <= end_date)
masked_sensor_readings = sensor_readings.loc[mask] #Get all readings between the two dates
masked_sensor_readings = masked_sensor_readings.sort_values('created_at')
#We then calculate how many hours have passed for that specific sensor and date range
try:
sensor_readings_start_date = masked_sensor_readings.loc[masked_sensor_readings.index.values[0], 'created_at']
sensor_readings_end_date = \
masked_sensor_readings.loc[masked_sensor_readings.index.values[len(masked_sensor_readings)-1], 'created_at']
timedelta_in_seconds = sensor_readings_end_date - sensor_readings_start_date #This returns Timedelta object
timedelta_in_seconds = timedelta_in_seconds.total_seconds()
total_number_of_hours = timedelta_in_seconds/3600 + 1 #The +1 is a fencepost correction: a span of N hours contains N+1 hourly readings.
hours_in_date_range = ((end_date-start_date).total_seconds())/3600 + 1
proportion_of_total_uptime = (len(masked_sensor_readings)/hours_in_date_range) * 100 #Proportion of uptime over TOTAL HOURS
proportion_within_sensor_uptime = (len(masked_sensor_readings)/total_number_of_hours) * 100 #Proportion of uptime for the sensor's first and last uploaded dates.
if proportion_within_sensor_uptime <= 100.1:
return [sensor_id, 255*(proportion_within_sensor_uptime/100), proportion_of_total_uptime, tenant_first_name, tenant_last_name, tenant_id]
else:
pass
except IndexError:
pass
In [113]:
#From the time a sensor was deployed until now: what percentage of the hours in which a violation was possible were actually in violation?
#Only consider sensors that have been up for a minimum of 30 days.
#Account for sensor downtime.
#Instead of calculating proportions that can exceed 100%, we calculate them over the time the sensor was actually up - so basically, if it wasn't up
#we assume that there was no violation.
def violation_percentages(data, start_date, end_date, sensor_id):
sensor_readings = data.loc[data.sensor_id == sensor_id] #All readings for a sensorID
try:
sensor_readings.loc[:,'created_at'] = map(pd.Timestamp, sensor_readings.created_at) #convert all to timestamps
except TypeError:
pass
#Filter out sensors that are < 30 days old
try:
sensor_readings_start_date = sensor_readings.loc[sensor_readings.index.values[0], 'created_at'].date()
today = date.today()
datediff = today - sensor_readings_start_date
except IndexError:
return "No readings in date range."
if datediff.days < 30: #If a sensor has been up for < 30 days, don't do anything
pass
else:
start_date = pd.Timestamp(start_date) #Convert dates to pd.Timestamp
end_date = pd.Timestamp(end_date)
mask = (sensor_readings['created_at'] > start_date) & (sensor_readings['created_at'] <= end_date) #mask for date range
masked_sensor_readings = sensor_readings.loc[mask]
try:
#First, find all possible violation-hours
##We need to index as datetimeindex in order to use the .between_time method
sensor_readings.set_index(pd.DatetimeIndex(sensor_readings['created_at']), inplace = True)
##This returns day and night subsets of the readings
day_readings = sensor_readings.between_time(start_time='06:00', end_time='22:00')
night_readings = sensor_readings.between_time(start_time='22:00', end_time='6:00')
##Now, we count how many rows are violations and divide by total possible violation hours
#For day, if outdoor_temp < 55
day_total_violable_hours = len(day_readings.loc[day_readings['outdoor_temp'] < 55])
day_actual_violation_hours = len(day_readings.loc[day_readings['violation'] == True])
#For night, if outdoor_temp < 40
night_total_violable_hours = len(night_readings.loc[night_readings['outdoor_temp'] < 40])
night_actual_violation_hours = len(night_readings.loc[night_readings['violation'] == True])
#Calculate percentage
try:
violation_percentage = float(day_actual_violation_hours + night_actual_violation_hours)/float(day_total_violable_hours + night_total_violable_hours)
except ZeroDivisionError:
return "No violations in this range."
return violation_percentage #violationpercentage
except IndexError:
pass
unique_sensors = readings['sensor_id'].unique()
for ids in unique_sensors:
print "Sensor ID: {0}, Violation Percentage: {1}".format(ids, violation_percentages(readings, '2016-01-01', '2016-02-07', ids))
In [4]:
#This function creates a simulated dataset of readings.
def simulate_data(start_date, end_date, polling_rate, sensor_id): #polling_rate in minutes
start_date = pd.Timestamp(start_date)
end_date = pd.Timestamp(end_date)
#how many hours between the two dates:
timedelta_in_seconds = end_date-start_date
total_number_of_hours = timedelta_in_seconds.total_seconds()/(polling_rate*60)
#Create an empty pandas dataframe
index = xrange(1,int(total_number_of_hours)+1)
columns = ['created_at', 'sensor_id']
simulated_readings = pd.DataFrame(index = index, columns = columns)
simulated_readings.loc[:,'sensor_id'] = sensor_id
#Populate it with columns of 'create_at' dates
time_counter = start_date
for i in simulated_readings.index.values:
simulated_readings.loc[i,'created_at'] = time_counter
time_counter = time_counter + pd.Timedelta('00:%s:00' % polling_rate)
return simulated_readings
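In [ ]:
#Sanity check for the uptime math using simulated data: a sensor polling exactly once an hour for the
#whole window should report close to 100% uptime. Sensor id 9999 is a made-up test id, not a real sensor.
fake_readings = simulate_data('2016-02-01', '2016-02-08', 60, 9999)
sensor_down_complete(fake_readings, '2016-02-01', '2016-02-08', 9999)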
In [37]:
#This function generates a report; we might want to make this a cron job (a sample crontab entry follows this cell).
def generate_report(start_date, end_date):
report = []
sensor_ids = readings.sensor_id.unique()
start_date = pd.Timestamp(start_date)
end_date = pd.Timestamp(end_date)
for ids in sensor_ids:
temp = sensor_down(readings, start_date, end_date, ids)
if temp is not None:
report.append(temp)
else:
pass
return report
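In [ ]:
#Steps 4/5 sketch: once the cleaning/report logic lives in a standalone script, a crontab entry along
#these lines would run it every 6 hours during the test window; the interpreter and script paths are
#placeholders. After the 5-7 day test period the interval can be relaxed (e.g. once a day).
#0 */6 * * * /usr/bin/python /path/to/heatseek_clean_and_report.py
#0 2 * * * /usr/bin/python /path/to/heatseek_clean_and_report.py   (daily at 2am, post-deploy)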
In [49]:
tic = time.clock()
report = generate_report('2016-02-01','2016-02-07')
header =['sensorID', 'status', 'Percentage of uptime in daterange', 'FirstName', 'LastName' , 'userID']
toc = time.clock()
toc - tic
Out[49]:
In [17]:
tic = time.clock()
report = dirty_data(readings)
header = ['UserID', 'SensorID', 'Outside90', 'Inside40', 'InsideColderOutside']
toc = time.clock()
toc - tic
In [50]:
#Write the list-of-rows report from generate_report (with the matching header) to CSV
csvoutput = open('sensors.csv', 'wb')
writer = csv.writer(csvoutput)
writer.writerow(header)
for i in report:
writer.writerow(i)
csvoutput.close()
In [23]:
#Write the dirty_data DataFrame report to CSV (this expects report from the dirty_data cell, not the generate_report list)
report.to_csv('dirtydata.csv', index = False, na_rep="Not Currently Assigned")
In [98]:
#What criteria do we want to use?
#SENSOR NUMBERS AS IDENTIFIERS, SOME DEGREE OF SEVERITY OF PROBLEM (CATEGORICAL)
#for the criteria we choose, how do we define "bad"?
#total days
#proportion of days (or clusters?)
#temperature discrepancy
#multiple apartments in the same building (what % of the apartments in a building are affected, and how bad is it)
#multiple buildings by the same landlord (what % of the buildings a landlord owns are bad)
#are our sensors failing in a specific building?
#Getting rid of test cases:
#1. Can we just delete by test IDs?
#2. If testing was separated by a minute, we can find all test cases by looping through all users,
# and if they have a bunch of data that was collected within minutes, delete the user?
#We first convert the string dates into datetime format
sensordata['formatteddate'] = sensordata.created_at.apply(lambda x: pd.to_datetime(x, format = "%Y-%m-%d %H:%M"))
#Then, one way of telling if a user_id was an actual user or a test case was to calculate the average timedelta for each user_id.
#Timedeltas of 1min are tests, 1 hour are users (don't know if this is always true, but if no user has an average polling rate
#of 1 min, we can use a bunch of methods to filter away test cases step by step).
sensordata['averagetimedelta'] = 0.00 #makes a new column
for i in sensordata.user_id.unique(): #for each user
timelist = sensordata.loc[sensordata.user_id == i, 'formatteddate'] #this gives us a list of all their times as timestamps
timedeltas = []
for j in range(1, len(timelist)):
timedeltas.append(timelist.iloc[j] - timelist.iloc[j-1]) #list of differences in time between time point j and j-1
try:
#we print the user_id, followed by their average time delta from point j to j-1
averagetimedelta = abs(sum(timedeltas, datetime.timedelta(0)))/len(timedeltas) #the average timedelta
print i, averagetimedelta
sensordata.loc[sensordata.user_id == i, 'averagetimedelta'] = averagetimedelta.total_seconds()
except ZeroDivisionError: #some cases have too few points (which results in a zero division error)
#instead of breaking when we encounter a ZeroDivisionError, just print the following:
print i, "Too few data points?"
#3. If user ids are recycled, we'll have to do a combination of those things.
#Some sensors
In [322]:
#Triage
#We want to have a measure of which users are facing the most chronic problems.
#Metric combining temperature difference and chronicity of problems
#Write some code that subsets all the violation == 't' cases
sensordataviolations = sensordata[sensordata.violation == 't'] #here it is.
#Hackiest method: just number of violations / number of non-violations and sort users by that
#That is, which users have had the most violations given the total number of readings
violationsovertime = []
for i in sensordata.user_id.unique():
nonviolations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['f'] #Number of non-violations ('f')
try:
violations = sensordata.loc[sensordata.user_id == i, 'violation'].value_counts()['t'] #Number of violations = 't'
except KeyError:
violations = 0
sensordata.loc[sensordata.user_id == i, 'vfreq'] = float(violations)/float(nonviolations)
violationsovertime.append([i, (float(violations)/float(nonviolations))])
#violationsovertime gives, for each user_id, the ratio of their violating readings to their non-violating readings
In [ ]:
#Stuff to do for fun
#Variability/consistency
#Which buildings have the least/most variable temperatures?
#For this, we just calculate within-person variability (how much do sensor temperatures from the same user vary as a function of time)
#We can use this same process to calculate variability between locations (e.g., just calculate the variance for each location)
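In [ ]:
#A minimal sketch of the variability idea above: temperature spread per user. Assumes sensordata is
#loaded (as in the test-case-detection cell above) and that 'temp' is a numeric column.
temperature_variability = sensordata.groupby('user_id')['temp'].std().sort_values()
temperature_variability.head(10) #least variable users; use .tail(10) for the most variable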
In [ ]:
#Now, we loop over all unique users in the dataset and generate a measure of how long they've had the sensor running
sensordata['totaltime'] = 0
sensordata['vfreq'] = 0
for i in sensordata.user_id.unique():
firstentry = len(sensordata.loc[sensordata.user_id==i,'formatteddate']) #Number of readings for this user (used below to index its last row)
lasttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[0] #This was the timestamp of the latest timepoint
firsttime = sensordata.loc[sensordata.user_id == i, 'formatteddate'].iloc[firstentry-1] #This was the timestamp of the first timepoint
sensordata.loc[sensordata.user_id == i, 'totaltime'] = lasttime - firsttime #This is the timedelta (over how long a period readings were made)
#print i, lasttime-firsttime
In [112]:
#sensors.user_id.value_counts()
#sensordata.violation.value_counts() #This returns the number of 't's and 'f's
#np.sort(userdata.id.unique())
#np.intersect1d(userdata.id.unique(), sensordata.user_id.unique()) #Returns the common ids in both the datasets.
#readings.sensor_id.unique