In [1]:
###
### 1. IMPORT MODULES
###
import pandas as pd
import time
import os
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score # NB: this module was renamed sklearn.model_selection in scikit-learn 0.18+
from random import shuffle
import pickle
import copy
def print_elapsed_time():
print "%0.1f minutes elapsed since 'start' variable reset" % ((time.time() - start)/60)
def float_proportion(thelist):
counter = 0
try:
for item in thelist:
if item != int(item):
counter += 1
return counter * 1.0 / len(thelist)
except:
return -1
def increment_dict(d, key, increment=1):
if key in d.keys():
d[key] += increment
else:
d[key] = increment
return d
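# quick illustrative checks of the helpers above (values are made up):
#   float_proportion([1.0, 2.5, 3.0, 4.5])  -> 0.5   (half the values have a fractional part)
#   float_proportion(['a'])                 -> -1    (any error returns -1)
#   increment_dict({'a': 1}, 'a')           -> {'a': 2}
#   increment_dict({'a': 1}, 'b', 3)        -> {'a': 1, 'b': 3}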
parameter_names = ['min',
'median',
'max',
'interquartile',
'stdev',
'float_prop',
'winter_corr',
'day_corr',
'jan_median',
'jul_median',
'zero_prop',
'noon_midn_med_dif']
In [ ]:
###
### 2. SPLIT BIG CSV INTO 10 SMALLER CSVs
###
# NB: splits only occur after a change in the FileName field, so rows sharing a filename are never split across output files
do_split = False # since this is a lengthy process that only has to be done once,
# by default it is switched off
if do_split == True:
def split_file(filepath, lines):
"""Split a file into chunks of roughly `lines` lines, breaking only after a change in the first field of the csv.
Presumes there is always at least one field change within each block of lines."""
counter = 0
path, filename = os.path.split(filepath)
# filename.split('.') would not work for filenames with more than one .
basename, ext = os.path.splitext(filename)
# open input file
with open(filepath, 'r') as f_in:
try:
# open the first output file
f_out = open(os.path.join(path, '{}_{}{}'.format(basename, counter, ext)), 'w')
status = 'incomplete'
# loop over all lines in the input file, and number them
for i, line in enumerate(f_in):
if i == 0:
firstline = line
if status == 'waiting for field change':
if line.strip().split(',')[0] != field:
f_out.close()
counter += 1
print counter
f_out = open(os.path.join(path, '{}_{}{}'.format(basename, counter, ext)), 'w')
status = 'incomplete'
f_out.write(firstline)
if (i > 5 and i % lines == 0):
status = 'waiting for field change'
field = line.strip().split(',')[0]
# write the line to the output file
if i != 1: # remove second line of file, which is not data
f_out.write(line)
finally:
# close the last output file
f_out.close()
split_file("C:/Users/David/Documents/IPython Notebooks/elancematthew/wxdatarawclean.csv", 567000)
In [15]:
###
### 3. ADD CALCULATED FIELDS/COLUMNS TO INPUT CSV
###
# added columns:
# datetime_iso YYYY-MM-DD-HHMM Calculated field so data and time are sortable
# winterness 0 to 6 Number of months away from July; maximum is January = 6, Dec. & Feb. = 5, etc.
# Used to estimate seasonal trends
# dayness 0 to 12 Number of hours away from midnight; noon = 12, 11am and 1pm = 11, etc.
# Used to estimate daily trends
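# worked example (hypothetical row, assuming dates in MM/DD/YYYY form as the parsing below implies):
#   Date = '01/15/2005', Hour = 1300
#   month = 1, winterness = 6 (January is six months from July)
#   dayness = 12 - abs(13 - 12) = 11 (1 pm is eleven hours from midnight)
#   datetime_iso = '2005-01-15-1300'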
do_pickle = False # since this is a lengthy process that only needs to be done once,
# by default it is switched off
if do_pickle == True:
for filenum in range(10):
osfilename = 'wxdatarawclean_' + str(filenum) + '.csv'
picklename = 'wxdatamunged_' + str(filenum) + '.pickle'
print osfilename
dfin = pd.read_csv(osfilename)
winterness_conversion = {1:6, 2:5, 12:5, 3:4, 11:4, 4:3, 10:3, 5:2, 9:2, 6:1, 8:1, 7:0}
start = time.time()
def add_iso(df):
try:
parsed_year = df.Date[-4:]
parsed_month = df.Date[0:2]
parsed_day = df.Date[3:5]
hour = df.Hour
month = int(parsed_month)
dayness = 12 - abs(int(hour/100) - 12)
winterness = winterness_conversion[int(parsed_month)]
# make sure there are no errors in the date and time fields
assert 1930 < int(parsed_year) < 2015
assert 1 <= int(parsed_month) <= 12
assert 1 <= int(parsed_day) <= 31
assert 0 <= hour <= 2400
date_time = parsed_year + '-' + parsed_month + '-' + parsed_day + '-'
date_time += str(int(hour)).zfill(4) # zero-pad hour to HHMM (e.g., 0 -> '0000', 900 -> '0900')
return pd.Series({'datetime_iso': date_time, 'winterness':winterness, 'month':month, 'dayness':dayness})
except:
return pd.Series({'datetime_iso': 'ERROR', 'winterness':3, 'month':6, 'dayness':6}) # median values used
dfin = dfin.merge(dfin.apply(add_iso, axis=1), left_index=True, right_index=True)
dfin.to_pickle(picklename)
print_elapsed_time()
print "If you are reading this in stdout, all dates and times were validated as being in correct range."
In [ ]:
# output columns:
# FileName
# datetime_iso sortable YYYY-MM-DD-HHMM
# battery_voltage battery voltage (e.g. 1280 = 12.80 V)
# upper_temp upper elevation temperature (F)
# mid_temp mid elevation temperature (F)
# lower_temp lower elevation temperature (F)
# humidity relative humidity (%)
# wind_min wind speed minimum past hour (mph)
# wind_avg wind speed average past hour (mph)
# wind_max wind speed maximum past hour (mph)
# wind_dir wind direction vector average past hour (degrees from true north)
# precip precipitation past hour in hundredths (e.g. 5 = .05 inches)
# snow_24 24 hour snow depth if available (inches)
# snow_total total snow depth if available (inches)
In [21]:
###
### 4. IDENTIFY 'LOW HANGING FRUIT', columns that are most easily identified
###
do_this_part = True
if do_this_part == True:
process_all_pickles = True # to run the whole script; otherwise it's in debug mode
do_part = 9 #if it is -1, will process entire pickle; ignored if process_all_pickles == True
### ignored if process_all_pickles == True:
picklename = "wxdatamunged_1.pickle"
# picklename = "dfin.pickle" # training set
# the two variants below are needed because the training set used different header capitalization
filename_formatted = 'FileName'
# filename_formatted = 'filename' # training set
reject_params = {}
machine_params = {}
training_params = {}
parameters = []
categories = []
samples = []
for pickle_number in range(10):
if process_all_pickles == True or pickle_number == 0:
if process_all_pickles == False:
dfin = pd.read_pickle(picklename)
else:
dfin = pd.read_pickle("wxdatamunged_"+str(pickle_number)+".pickle")
print pickle_number
filenames = list(dfin[filename_formatted].unique())
results = {}
if process_all_pickles == True or do_part == -1:
min_filenames = 0
max_filenames = len(filenames)
else:
min_filenames = do_part
max_filenames = do_part+1
for filename_pos in range(min_filenames, max_filenames):
current_filename = filenames[filename_pos]
if process_all_pickles == False and do_part != -1:
print current_filename
print "Note: these are before column ids are possibly changed to bimodal or precipitation_duplicate to undetermined."
temp_parameters = []
temp_samples = []
temp_categories = []
dfsub = dfin[dfin[filename_formatted] == current_filename]
dfsub = dfsub.sort('datetime_iso') # sort() returns a copy, so reassign
if picklename == "dfin.pickle": #training set
columns_to_id = ['raw06', 'raw07', 'raw08', 'raw09', 'raw10', 'raw11', 'raw12', 'raw13'] #, 'raw14', 'raw15', 'raw16']
else:
columns_to_id = ['raw06', 'raw07', 'raw08', 'raw09', 'raw10', 'raw11', 'raw12', 'raw13',
'raw14', 'raw15', 'raw16']
columns_to_id = list(reversed(columns_to_id))
# note that these counters were used to build the example table for approval
# after that I considered them no longer useful as column header suffixes
# but some are still used as initial categorization criteria
# therefore due to time constraints instead of recoding,
# they are all assigned as column header suffixes and then removed later
undeterminable_counter = 0
undetermined_counter = 0
temperature_counter = 0
precipitation_counter = 0
decimal_precipitation_counter = 0
wind_dir_counter = 0
bimodal_counter = 0
cols_to_skip = []
column_ids = {}
column_position = 0
for column_position in range(len(columns_to_id)):
cur_col = columns_to_id[column_position]
colall = dfsub[cur_col]
colpos = dfsub[dfsub[cur_col] > -49][cur_col]
#### note, only values between 15th and 85th percentile are used, to eliminate outliers ####
zero_prop = len(dfsub[dfsub[cur_col] == 0.0])/(1.0*(len(dfsub)+1))
#trim to remove outliers
remove_low = False
remove_high = False
if colpos.quantile(.15) > colpos.min():
remove_low = True
if colpos.quantile(.85) < colpos.max():
remove_high = True
if remove_high == True and remove_low == True:
colpos = colpos[(colpos>colpos.quantile(.15)) & (colpos<colpos.quantile(.85))]
elif remove_high == True:
colpos = colpos[colpos<colpos.quantile(.85)]
elif remove_low == True:
colpos = colpos[colpos>colpos.quantile(.15)]
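# illustrative note (hypothetical values): for a column whose readings mostly fall in 0..100 but
# contain a single bad spike of 9999, the .15/.85 quantile trim above drops the spike (and any
# extreme lows) before the medians and correlations below are computed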
#colpos2 (the next column in the id order) is used only for wind speed identification
#note that neighboring columns are only used in this rule-based step,
#not in machine learning, to avoid overfitting
if column_position+3 < len(columns_to_id):
colpos2 = dfsub[dfsub[columns_to_id[column_position+1]] > -49][columns_to_id[column_position+1]]
float_prop = float_proportion(list(colpos))
if 0.3 < float_prop < 0.7:
bimodal_counter += 1
if column_position not in cols_to_skip:
sdev = colpos.std()
# determine whether there are enough days and months in the sample to use this metric
use_winterness, use_dayness = False, False
if (dfsub[dfsub.winterness > -99]['winterness'].min() <= 1 and dfsub[dfsub.winterness > -99]['winterness'].max() >= 5):
use_winterness = True
winter_correlation = colpos.corr(dfsub[dfsub.winterness > -99]['winterness'], method='spearman')
else:
winter_correlation = -99
if (dfsub[dfsub.dayness > -99]['dayness'].min() <= 1 and dfsub[dfsub.dayness > -99]['dayness'].max() >= 11):
use_dayness = True
day_correlation = colpos.corr(dfsub[dfsub.dayness > -99]['dayness'], method='spearman')
else:
day_correlation = -99
# for snowfall
janmedian = dfsub[(dfsub[cur_col] > -49) & (dfsub.winterness == 6)][cur_col].median()
julmedian = dfsub[(dfsub[cur_col] > -49) & (dfsub.winterness == 0)][cur_col].median()
#for machine learning
noon_midn_med_dif = (dfsub[(dfsub[cur_col] > -49) & (dfsub.dayness == 12)][cur_col].median()-
dfsub[(dfsub[cur_col] > -49) & (dfsub.dayness == 0)][cur_col].median())
# if all values are -49 or less:
if len(colpos) == 0:
undeterminable_counter += 1
column_ids[cur_col] = 'undeterminable_' + str( undeterminable_counter )
# if more than 98 percent of values are -49 or less
elif colall.quantile(.98) <= -49:
undeterminable_counter += 1
column_ids[cur_col] = 'undeterminable_' + str(undeterminable_counter)
#evaluate whether it is battery voltage
# minimum between 5 and 15, maximum between 10 and 16
# standard deviation < 1
# float proportion greater than 0.5
elif (5 <= colpos.min() < 15 and
10 <= colpos.max() < 16 and
sdev < 1 and
float_prop > 0.5):
column_ids[cur_col] = "battery_voltage"
# evaluate whether it is wind direction
# DEPRECATED criteria: min 0-10, max 350-360, median is within 15 degrees of average of 25 percentile and 75 percentile
#OR maximum between 359 and 360
#
elif (column_position < len(columns_to_id) - 3 and
sdev > 10 and day_correlation < 0.1 and 90 < colpos.median() < 270 and
colpos.max() < 350):
wind_dir_counter += 1
if wind_dir_counter == 1:
column_ids[cur_col] = 'wind_dir'
if colpos2.std() > 0:
column_ids[columns_to_id[column_position+1]] = 'wind_max'
column_ids[columns_to_id[column_position+2]] = 'wind_avg'
column_ids[columns_to_id[column_position+3]] = 'wind_min'
else: #column before wind_dir is nan; either following two columns or following
#three columns are wind speeds
if column_position < len(columns_to_id) - 4:
switch = True
else:
switch = False
if switch == True:
colpos3 = dfsub[dfsub[columns_to_id[column_position+2]] > -49][columns_to_id[column_position+2]]
colpos4 = dfsub[dfsub[columns_to_id[column_position+3]] > -49][columns_to_id[column_position+3]]
colpos5 = dfsub[dfsub[columns_to_id[column_position+4]] > -49][columns_to_id[column_position+4]]
if not ( 0 <= (colpos5.quantile(.25) - colpos4.quantile(.25)) < 30 and
0 <= (colpos4.quantile(.25) - colpos3.quantile(.25)) < 30 and
0 < (colpos5.quantile(.5) - colpos4.quantile(.5)) < 30 and
0 < (colpos4.quantile(.5) - colpos3.quantile(.5)) < 30 and
0 < (colpos5.quantile(.75) - colpos4.quantile(.75)) < 30 and
0 < (colpos4.quantile(.75) - colpos3.quantile(.75)) < 30):
switch = False
if switch == True:
cols_to_skip.append(column_position+4) #other three are below
column_ids[columns_to_id[column_position+2]] = 'wind_max'
column_ids[columns_to_id[column_position+3]] = 'wind_avg'
column_ids[columns_to_id[column_position+4]] = 'wind_min'
else: # only two following columns are windspeed
column_ids[columns_to_id[column_position+2]] = 'wind_max'
column_ids[columns_to_id[column_position+3]] = 'wind_min'
if columns_to_id[column_position+1] not in column_ids.keys(): #might have already been identified as bad data above
undetermined_counter += 1
column_ids[columns_to_id[column_position+1]] = 'undetermined_' + str(undetermined_counter)
cols_to_skip.append(column_position+1)
cols_to_skip.append(column_position+2)
cols_to_skip.append(column_position+3)
else:
column_ids[cur_col] = 'wind_dir_duplicate_' + str(wind_dir_counter)
# evaluate whether it is snowfall
elif (janmedian > 30 and (np.isnan(julmedian) or julmedian == 0)):
column_ids[cur_col] = 'snow_total'
elif (janmedian > 1 and (np.isnan(julmedian) or julmedian == 0)):
column_ids[cur_col] = 'snow_24'
# evaluate whether it is precipitation
elif zero_prop > 0.3:
precipitation_counter += 1
if 0 <= colpos.median() < 1:
decimal_precipitation_counter += 1
column_ids[cur_col] = 'precipitation_' + str(precipitation_counter)
else:
column_ids[cur_col] = 'precipitation_duplicate_' + str(precipitation_counter)
### note the above code was 'hacked' from existing code; here's what happens:
# if the column is as yet uncategorized and more than 30% of its values = 0:
#   if the median is < 1, it is marked "precipitation_1", "precipitation_2", etc. (never just "precipitation")
#   otherwise (median >= 1), it is marked "precipitation_duplicate_1", etc.
# so the labels carry suffix numbers and the word 'duplicate' even when the column is not a duplicate;
# this doesn't matter, as these suffixes get routinely sliced off later.
# it just serves to distinguish "definite" precipitation columns from less definite ones.
# it's inelegant, but it's a quick fix
# evaluate whether it is temperature
elif (use_winterness == True and
use_dayness == True and
colpos.max() < 100 and
0 < colpos.median() < 70 and
-0.9 <= winter_correlation <= 0.05 and
-0.1 <= day_correlation <= 0.5 and sdev > 0.1):
temperature_counter += 1
column_ids[cur_col] = 'temperature_' + str(temperature_counter)
# evaluate whether it is humidity
elif ((97 <= colpos.max() <= 100 or (colpos.quantile(.25) > 30 and
colpos.max() <= 100)) and
-0.2 < winter_correlation < 0.75 and
-0.8 < day_correlation < 0.1):
column_ids[cur_col] = 'humidity'
elif use_winterness == False or use_dayness == False:
undeterminable_counter += 1
column_ids[cur_col] = 'undeterminable_' + str( undeterminable_counter)
else:
undetermined_counter += 1
column_ids[cur_col] = 'undetermined_' + str( undetermined_counter)
if column_ids[cur_col][:8] != 'undeterm':
if np.isnan(julmedian):
julmedian= -47
category_col = column_ids[cur_col]
category_col = category_col.replace('_24', '_xxx')
for i in range(1,9):
category_col = category_col.replace('_'+str(i), '')
category_col = category_col.replace('_duplicate', '')
category_col = category_col.replace('_xxx', '_24')
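# illustrative examples of the suffix stripping above:
#   'temperature_2'             -> 'temperature'
#   'precipitation_duplicate_1' -> 'precipitation'
#   'snow_24'                   -> 'snow_24' (protected by the '_24' -> '_xxx' -> '_24' round trip)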
key = current_filename+"__"+cur_col
temp_samples.append(key)
temp_categories.append(category_col)
templist = []
templist.append([colpos.min(),
colpos.median(),
colpos.max(),
colpos.quantile(.75) - colpos.quantile(.25),
sdev,
float_prop,
winter_correlation,
day_correlation,
janmedian,
julmedian,
zero_prop,
noon_midn_med_dif])
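# the 12 values above correspond, in order, to parameter_names defined in section 1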
temp_parameters.append(templist)
training_params[key] = templist
if column_ids[cur_col][:14] == 'undeterminable':
if np.isnan(julmedian):
julmedian= -47
key = current_filename+"__"+cur_col
templist = []
templist.append([colpos.min(),
colpos.median(),
colpos.max(),
colpos.quantile(.75) - colpos.quantile(.25),
sdev,
float_prop,
winter_correlation,
day_correlation,
janmedian,
julmedian,
zero_prop,
noon_midn_med_dif])
reject_params[key] = templist
if column_ids[cur_col][:12] == 'undetermined':
if np.isnan(julmedian):
julmedian= -47
key = current_filename+"__"+cur_col
templist = []
templist.append([colpos.min(),
colpos.median(),
colpos.max(),
colpos.quantile(.75) - colpos.quantile(.25),
sdev,
float_prop,
winter_correlation,
day_correlation,
janmedian,
julmedian,
zero_prop,
noon_midn_med_dif])
machine_params[key] = templist
if len(column_ids) < len(columns_to_id):
print "Column(s) missing"
print column_ids
if process_all_pickles == False and do_part != -1: #review one filename
print cur_col.ljust(6),
print ("%0.1f" % (colpos.min())).rjust(5),
print ("%0.1f" % (colpos.quantile(.25))).rjust(5),
print ("%0.1f" % (colpos.quantile(.50))).rjust(5),
print ("%0.1f" % (colpos.quantile(.75))).rjust(5),
print ("%0.1f" % (colpos.max())).rjust(5),
print ("%0.1f" % (colpos.mean())).rjust(5),
print ("%0.3f" % (winter_correlation)).rjust(7),
print ("%0.3f" % (day_correlation)).rjust(7),
print ("%0.3f" % (zero_prop)).rjust(7),
print ("%0.3f" % (sdev)).rjust(7),
print ("%0.3f" % (float_prop)),
#print ' '
print column_ids[cur_col]
if decimal_precipitation_counter > 0:
for key in column_ids.keys():
if column_ids[key].find('precipitation_duplicate') != -1:
column_ids[key] = 'undetermined' # no suffix number given or needed because they ended up not being used
if bimodal_counter >= 3:
for key in column_ids.keys():
column_ids[key] = 'bimodal'
for position in range(len(temp_samples)):
concatname = temp_samples[position]
part1 = temp_samples[position][:-7]
part2 = temp_samples[position][-5:]
if column_ids[part2][:12] == 'undetermined':
tempvalue = training_params[concatname]
del training_params[concatname]
machine_params[concatname] = tempvalue
temp_samples[position] = 'XXXXXXXXXXXX' #not very elegant, I know...
temp_categories[position] = 'XXXXXXXXXXXX'
temp_parameters[position] = 'XXXXXXXXXXXX'
if column_ids[part2][:7] == 'bimodal':
temp_categories[position] = 'bimodal'
temp_samples = [x for x in temp_samples if x != 'XXXXXXXXXXXX']
temp_categories = [x for x in temp_categories if x != 'XXXXXXXXXXXX']
temp_parameters = [x for x in temp_parameters if x != 'XXXXXXXXXXXX']
for position in range(len(temp_categories)): #brute force removal of a bug that was resulting in strange category names
if temp_categories[position][:6] == 'precip':
temp_categories[position] = "precipitation"
samples += temp_samples
categories += temp_categories
parameters += temp_parameters
results[current_filename] = column_ids
if process_all_pickles == True:
suffix = str(pickle_number)+'.json'
with open('result_dict'+suffix, 'w+') as f:
f.write(json.dumps(results))
if process_all_pickles == True:
with open('categories.json', 'w+') as f:
f.write(json.dumps(categories))
with open('samples.json', 'w+') as f:
f.write(json.dumps(samples))
with open('parameters.json', 'w+') as f:
f.write(json.dumps(parameters))
with open('reject_params.json', 'w+') as f:
f.write(json.dumps(reject_params))
with open('machine_params.json', 'w+') as f:
f.write(json.dumps(machine_params))
with open('training_params.json', 'w+') as f:
f.write(json.dumps(training_params))
In [59]:
# auxiliary script to print, for a single column: median per month, median per dayness value, then quantiles
# will only work as intended if, in the block above, process_all_pickles == False and do_part != -1
# (if do_part == -1, it will still run, but will show the last filename in that pickle)
# (if process_all_pickles == True, it will show the last filename in the entire dataset)
print pickle_number, current_filename
cc = 'raw10'
for i in range(1, 13):
print cc, i,
print dfsub[(dfsub[cc] > -49) & (dfsub.month == i)][cc].median()
for i in range(1, 13):
print cc, i,
print dfsub[(dfsub[cc] > -49) & (dfsub.dayness == i)][cc].median()
for i in range(2, 100, 2):
print str(i).rjust(3),
print str(dfsub[cc].quantile(i/100.0)).ljust(8)
x = dfsub[(dfsub[cc] > -49)][cc]
In [24]:
###
### 5. PERFORM MACHINE LEARNING (RANDOM FOREST METHOD) TO CLASSIFY UNDETERMINED COLUMNS
###
do_this_part = True
if do_this_part == True:
categories = json.loads(open('categories.json', 'r').read())
samples = json.loads(open('samples.json', 'r').read())
parameters = json.loads(open('parameters.json', 'r').read())
# replace NaN with -999
for i in range(len(parameters)):
for j in range(len(parameters[i][0])): # the [0] is to hack a bug that enclosed lists 3 deep instead of the intended 2
if np.isnan(parameters[i][0][j]):
parameters[i][0][j] = -999
# use 90% of the data as the training set, 10% as the evaluation set
subset_index = range(len(parameters))
shuffle(subset_index)
training_set_length = int(0.9 * len(parameters))
evaluation_index = subset_index[training_set_length:]
train_index = subset_index[:training_set_length]
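# illustrative example (made-up size): with 1,000 samples, subset_index is a shuffled permutation of 0..999;
# train_index holds the first 900 shuffled positions and evaluation_index the remaining 100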
train_samples = []
train_categories = []
train_parameters = []
for i in range(len(train_index)):
train_samples.append(samples[train_index[i]])
train_categories.append(categories[train_index[i]])
train_parameters.append(parameters[train_index[i]][0])
evaluation_samples = []
evaluation_categories = []
evaluation_parameters = []
# use the held-out indices so the evaluation set does not overlap the training set
for i in range(len(evaluation_index)):
evaluation_samples.append(samples[evaluation_index[i]])
evaluation_categories.append(categories[evaluation_index[i]])
evaluation_parameters.append(parameters[evaluation_index[i]][0])
forest = RandomForestClassifier(n_estimators=1000)
forest.fit(train_parameters, train_categories)
category_array = np.asarray(train_categories)
scores = cross_val_score(forest, train_parameters, category_array, cv=20)
print("Accuracy: %0.2f (+/- %0.2f)"
% (scores.mean(), scores.std()*2))
print '\nFeature importances:'
assert len(parameter_names) == len(forest.feature_importances_)
for i in range(len(parameter_names)):
print parameter_names[i].ljust(18), round(forest.feature_importances_[i], 3)
evaluations = []
for parameter_list in evaluation_parameters:
evaluations.append(forest.predict(parameter_list))
assert len(evaluation_categories) == len(evaluations)
eval_right = 0
for i in range(len(evaluations)):
if str(evaluation_categories[i]) == str(evaluations[i][0]):
eval_right += 1
print "\n"
print "Evaluation set: %d out of %d correct (%d percent)" % (eval_right, len(evaluations),
int(eval_right * 100.0 / len(evaluations)))
pickle.dump( forest, open( "forest.pickle", "wb+" ) )
In [22]:
categories = json.loads(open('categories.json', 'r').read())
x = pd.Series(categories)
x.unique()
Out[22]:
In [25]:
###
### 6. Build results with predicted "undetermined" data
###
# concatenate results files into one
results_all = {}
for i in range(10):
results_all = dict(results_all.items() + json.loads(open('result_dict'+str(i)+'.json', 'r').read()).items())
with open('results_all.json', 'w+') as f:
f.write(json.dumps(results_all))
training_params = json.loads(open('training_params.json', 'r').read())
reject_params = json.loads(open('reject_params.json', 'r').read())
machine_params = json.loads(open('machine_params.json', 'r').read())
all_params = dict(training_params.items() + reject_params.items() + machine_params.items())
# replace NaN with -999
for key in training_params.keys():
for j in range(len(training_params[key][0])): #for some reason, it's a list inside a list
if np.isnan(training_params[key][0][j]):
training_params[key][0][j] = -999
for key in reject_params.keys():
for j in range(len(reject_params[key][0])):
if np.isnan(reject_params[key][0][j]):
reject_params[key][0][j] = -999
for key in machine_params.keys():
for j in range(len(machine_params[key][0])):
if np.isnan(machine_params[key][0][j]):
machine_params[key][0][j] = -999
# make pandas dataframe to hold results
dffinal = pd.DataFrame()
for filename in results_all.keys():
for rawname in results_all[filename].keys():
dffinal = dffinal.append(pd.DataFrame({'filename':[filename], 'raw_col':[rawname],
'original_cat':[str(results_all[filename][rawname])]}),
ignore_index=True)
dffinal.reset_index(drop=True, inplace=True)
# populate new results dicts from old, with forest prediction
labels = ['battery_voltage', 'bimodal', 'humidity', 'precipitation', 'snow_24',
'snow_total', 'temperature', 'wind_avg', 'wind_dir', 'wind_max', 'wind_min']
forest = pickle.load( open( "forest.pickle", "rb" ) )
# do machine learning prediction and build dict without numeric or "_duplicate" suffixes
dffinal['initial_col'] = ''
dffinal['final_col'] = ''
dffinal['machine_prob'] = ''
for pn in parameter_names:
dffinal[pn] = ''
for idx, row in dffinal.iterrows():
concatname = dffinal.filename[idx] + '__' + dffinal.raw_col[idx]
origcol = dffinal.original_cat[idx]
if origcol[:12] == 'undetermined':
dffinal.initial_col[idx] = 'undetermined'
dffinal.final_col[idx] = str(list(forest.predict(machine_params[concatname]))[0])
dffinal.machine_prob[idx] = int(forest.predict_proba(machine_params[concatname]).max()*100)
elif origcol[:14] == 'undeterminable':
dffinal.initial_col[idx] = 'undeterminable'
dffinal.final_col[idx] = 'undeterminable'
else:
found = False
for label in labels:
if origcol[:len(label)] == label:
dffinal.initial_col[idx] = label
dffinal.final_col[idx] = label
found = True
if found == False:
raise Exception('Label not found')
# number duplicate columns
dffinal.sort(['filename', 'final_col'], ascending=True, inplace=True)
dffinal['final_numbered'] = dffinal.final_col
filenames = dffinal.filename.unique()
for filename in filenames:
counter_total = {}
counter = {}
for idx, row in dffinal[dffinal.filename == filename].iterrows():
increment_dict(counter_total, dffinal.final_col[idx])
for idx, row in dffinal[dffinal.filename == filename].iterrows():
colname = dffinal.final_col[idx]
if counter_total[colname] > 1 or colname == 'undeterminable':
increment_dict(counter, colname)
suffixnum = counter[colname]
suffixtxt = str(suffixnum)
if suffixnum < 10:
suffixtxt = '0' + suffixtxt
dffinal.final_numbered[idx] = colname + "_" + suffixtxt
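# illustrative example: a file with three temperature columns ends up with temperature_01,
# temperature_02, temperature_03, while a category that occurs only once (other than
# 'undeterminable') keeps its bare name, e.g. 'humidity'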
# add parameters
for idx, row in dffinal.iterrows():
concatname = dffinal.filename[idx] + '__' + dffinal.raw_col[idx]
for i in range(len(parameter_names)):
dffinal[parameter_names[i]][idx] = all_params[concatname][0][i]
# somehow it ended up being a list within a list, hence the [0]
dffinal.to_pickle('dffinal.pickle')
dffinal.to_csv('categories_and_parameters.csv')
In [26]:
###
### 7. WRITE CSVs WITH REPLACED COLUMN NAMES
###
dffinal = pd.read_pickle('dffinal.pickle')
col_list = dffinal.final_numbered.unique()
for pickle_num in range(10): # process all ten munged pickles (0 through 9)
pickle_name = 'wxdatamunged_' + str(pickle_num) + '.pickle'
out_prefix = 'wxdata_categorized_' + str(pickle_num)
start=time.time()
dfin = pd.read_pickle(pickle_name)
filenames = list(dfin.FileName.unique())
dfout = pd.DataFrame(columns=col_list)
for filename in filenames:
dftemp = dfin.copy()
dftemp = dftemp[dftemp.FileName == filename]
sub_list = []
for item in dftemp.columns:
if item[:3] == 'raw' and int(item[-2:]) >= 6:
sub_list.append(dffinal[(dffinal.filename == filename) & (dffinal.raw_col == item)].final_numbered.iloc[0])
else:
sub_list.append(item)
dftemp.columns = sub_list
dfout = pd.concat([dfout, dftemp])
dfout.to_pickle(out_prefix+'.pickle')
dfout.to_csv(out_prefix+'.csv')
In [ ]: