In [1]:
###
### 1. IMPORT MODULES
###

import pandas as pd
import time
import os
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from random import shuffle
import pickle
import copy

def print_elapsed_time():
    print "%0.1f minutes elapsed since 'start' variable reset" % ((time.time() - start)/60)
    
def float_proportion(thelist):
    counter = 0
    try:
        for item in thelist:
            if item != int(item):
                counter += 1
        return counter * 1.0 / len(thelist)
    except:
        return -1
    
def increment_dict(d, key, increment=1):
    if key in d.keys():
        d[key] += increment
    else:
        d[key] = increment
    return d

parameter_names = ['min',
                       'median',
                       'max',
                       'interquartile',
                       'stdev',
                       'float_prop',
                       'winter_corr',
                       'day_corr',
                       'jan_median',
                       'jul_median',
                       'zero_prop',
                       'noon_midn_med_dif']

In [ ]:
###
### 2. SPLIT BIG CSV INTO 10 SMALLER CSVs
###

# nb without splitting contiguous values in the filenames field

do_split = False # since this is a lengthy process that only has to be done once,
                 # by default it is switched off

if do_split == True:

    def split_file(filepath, lines=numlines):
    """Split a file based on a number of lines, after a change in the first field of a csv. Presumes there is always at least one 
    field change between each block of lines."""
    counter = 0
    path, filename = os.path.split(filepath)
    # filename.split('.') would not work for filenames with more than one .
    basename, ext = os.path.splitext(filename)
    # open input file
    with open(filepath, 'r') as f_in:
        try:
            # open the first output file
            f_out = open(os.path.join(path, '{}_{}{}'.format(basename, counter, ext)), 'w')
            status = 'incomplete'
            # loop over all lines in the input file, and number them
            for i, line in enumerate(f_in):
                if i == 0:
                    firstline = line
                if status == 'waiting for field change':
                    if line.strip().split(',')[0] != field:
                        f_out.close()
                        counter += 1
                        print counter
                        f_out = open(os.path.join(path, '{}_{}{}'.format(basename, counter, ext)), 'w')
                        status = 'incomplete'
                        f_out.write(firstline)
                if (i > 5 and i % lines == 0):
                    status = 'waiting for field change'
                    field = line.strip().split(',')[0]
                # write the line to the output file
                if i <> 1: # remove second line of file, which is not data
                    f_out.write(line)
        finally:
            # close the last output file
            f_out.close()

    split_file("C:/Users/David/Documents/IPython Notebooks/elancematthew/wxdatarawclean.csv", 567000)

In [15]:
### 
### 3. ADD CALCULATED FIELDS/COLUMNS TO INPUT CSV
###

# added columns:

#     datetime_iso     YYYY-MM-DD-HHMM      Calculated field so data and time are sortable
#     winterness       0 to 6               Number of months away from July; maximum is January = 6, Dec. & Feb. = 5, etc.
#                                           Used to estimate seasonal trends
#     dayness          0 to 12              Number of hours away from midnight; noon = 12, 11am and 1pm = 11, etc.
#                                           Used to estimate daily trends

do_pickle = False   # since this is a lengthy process that only needs to be done once,
                    # by default it is switched off

if do_pickle == True:

    for filenum in range(10):
    
        osfilename = 'wxdatarawclean_' + str(filenum) + '.csv'
        picklename = 'wxdatamunged_' + str(filenum) + '.pickle'
        
        print osfilename
        
        dfin = pd.read_csv(osfilename)
        
        winterness_conversion = {1:6, 2:5, 12:5, 3:4, 11:4, 4:3, 10:3, 5:2, 9:2, 6:1, 8:1, 7:0}

        start = time.time()

        def add_iso(df):  
            try:
                parsed_year = df.Date[-4:]
                parsed_month = df.Date[0:2]
                parsed_day = df.Date[3:5]
                hour = df.Hour
                month = int(parsed_month)
                dayness = 12 - abs(int(hour/100) - 12)
                winterness = winterness_conversion[int(parsed_month)]

                # make sure there are no errors in the date and time fields
                assert 1930 < int(parsed_year) < 2015
                assert 1 <= int(parsed_month) <= 12
                assert 1 <= int(parsed_day) <= 31
                assert 0 <= hour <= 2400

                date_time = parsed_year + '-' + parsed_month + '-' + parsed_day + '-'

                if hour < 1000:         #add leading zeroes
                    date_time += '0'
                if hour < 10 :
                    date_time += '0'
                if hour == 0:
                    date_time += '0'
                date_time += str(hour)
                return pd.Series({'datetime_iso': date_time, 'winterness':winterness, 'month':month, 'dayness':dayness})
            except:
                return pd.Series({'datetime_iso': 'ERROR', 'winterness':3, 'month':6, 'dayness':6}) # median values used

        dfin = dfin.merge(dfin.apply(add_iso, axis=1), left_index=True, right_index=True)

        dfin.to_pickle(picklename)

        print_elapsed_time()
        print "If you are reading this in stdout, all dates and times were validated as being in correct range."

In [ ]:
# output columns:

#     FileName
#     datetime_iso     sortable YYYY-MM-DD-HHMM
#     battery_voltage  battery voltage (e.g. 1280 = 12.80 V)
#     upper_temp       upper elevation temperature (F)
#     mid_temp         mid elevation temperature (F)
#     lower_temp       lower elevation temperature (F)
#     humidity         relative humidity (%)
#     wind_min         wind speed minimum past hour (mph)
#     wind_avg         wind speed average past hour (mph)
#     wind_max         wind speed maximum past hour (mph)
#     wind_dir         wind direction vector average past hour (degrees from true north)
#     precip           precipitation past hour in hundredths (e.g. 5 = .05 inches)
#     snow_24          24 hour snow depth if available (inches)
#     snow_total       total snow depth if available (inches)

In [21]:
###
### 4. IDENTIFY 'LOW HANGING FRUIT', columns that are most easily identified
###

do_this_part = True

if do_this_part == True:
    process_all_pickles = True  # to run the whole script; otherwise it's in debug mode
    do_part = 9 #if it is -1, will process entire pickle; ignored if process_all_pickles == True
    
    ### ignored if process_all_pickles == True:
    picklename = "wxdatamunged_1.pickle"   # necessary because training set had different header capitalization
    # picklename = "dfin.pickle" # training set
    filename_formatted = 'FileName'
    # filename_formatted = 'filename' # training set

    
    reject_params = {}
    machine_params = {}
    training_params = {}
    
    parameters = []
    categories = []

    samples = []

    for pickle_number in range(10):

        if process_all_pickles == True or pickle_number == 0:

            if process_all_pickles == False:
                dfin = pd.read_pickle(picklename)
            else:
                dfin = pd.read_pickle("wxdatamunged_"+str(pickle_number)+".pickle")
                print pickle_number

            filenames = list(dfin[filename_formatted].unique())

            results = {}

            if process_all_pickles == True or do_part == -1:
                min_filenames = 0
                max_filenames = len(filenames)
            else:
                min_filenames = do_part
                max_filenames = do_part+1

            for filename_pos in range(min_filenames, max_filenames):
                current_filename = filenames[filename_pos]
                if process_all_pickles == False and do_part != -1:
                    print current_filename
                    print "Note: these are before column ids are possibly changed to bimodal or precipitation_duplicate to undetermined."
                temp_parameters = []
                temp_samples = []
                temp_categories = []

                dfsub = dfin[dfin[filename_formatted] == current_filename]
                dfsub.sort('datetime_iso')

                if picklename == "dfin.pickle": #training set
                    columns_to_id = ['raw06', 'raw07', 'raw08', 'raw09', 'raw10', 'raw11', 'raw12', 'raw13'] #, 'raw14', 'raw15', 'raw16']
                else:
                    columns_to_id = ['raw06', 'raw07', 'raw08', 'raw09', 'raw10', 'raw11', 'raw12', 'raw13',
                                     'raw14', 'raw15', 'raw16']

                columns_to_id = list(reversed(columns_to_id))

                # note that these counters were used to build the example table for approval
                # after that I considered them no longer useful as column header suffixes
                # but some are still used as initial categorization criteria
                # therefore due to time constraints instead of recoding,
                # they are all assigned as column header suffixes and then removed later
                
                undeterminable_counter = 0
                undetermined_counter = 0
                temperature_counter = 0
                precipitation_counter = 0
                decimal_precipitation_counter = 0
                wind_dir_counter = 0
                bimodal_counter = 0
                cols_to_skip = []

                column_ids = {}
                column_position = 0
                for column_position in range(len(columns_to_id)):
                    cur_col = columns_to_id[column_position]
                    colall = dfsub[cur_col]
                    colpos = dfsub[dfsub[cur_col] > -49][cur_col]

                    #### note, only values between 15th and 85th percentile are used, to eliminate outliers ####
                    
                    zero_prop = len(dfsub[dfsub[cur_col] == 0.0])/(1.0*(len(dfsub)+1))

                    #trim to remove outliers
                    remove_low = False
                    remove_high = False
                    if colpos.quantile(.15) > colpos.min():
                        remove_low = True
                    if colpos.quantile(.85) < colpos.max():
                        remove_high = False
                    if remove_high == True and remove_low == True:
                        colpos = colpos[(colpos>colpos.quantile(.15)) & (colpos<colpos.quantile(.85))]
                    elif remove_high == True:
                        colpos = colpos[colpos<colpos.quantile(.85)]
                    elif remove_low == True:
                        colpos = colpos[colpos>colpos.quantile(.15)]

                    #used only for windspeed
                    #note that other columns are only used in this step,
                    #not in machine learning, to avoid overfitting
                    if column_position+3 < len(columns_to_id):
                        colpos2 = dfsub[dfsub[columns_to_id[column_position+1]] > -49][columns_to_id[column_position+1]]

                    float_prop = float_proportion(list(colpos))
                    if 0.3 < float_prop < 0.7:
                        bimodal_counter += 1

                    if column_position not in cols_to_skip:
                        sdev = colpos.std()

                        # determine whether there are enough days and months in the sample to use this metric
                        use_winterness, use_dayness = False, False
                        if (dfsub[dfsub.winterness > -99]['winterness'].min() <= 1 and dfsub[dfsub.winterness > -99]['winterness'].max() >= 5):
                            use_winterness = True
                            winter_correlation = colpos.corr(dfsub[dfsub.winterness > -99]['winterness'], method='spearman')
                        else:
                            winter_correlation = -99
                        if (dfsub[dfsub.dayness > -99]['dayness'].min() <= 1 and dfsub[dfsub.dayness > -99]['dayness'].max() >= 11):
                            use_dayness = True
                            day_correlation = colpos.corr(dfsub[dfsub.dayness > -99]['dayness'], method='spearman')
                        else:
                            day_correlation = -99

                        # for snowfall
                        janmedian = dfsub[(dfsub[cur_col] > -49) & (dfsub.winterness == 6)][cur_col].median()
                        julmedian = dfsub[(dfsub[cur_col] > -49) & (dfsub.winterness == 0)][cur_col].median()

                        #for machine learning
                        noon_midn_med_dif = (dfsub[(dfsub[cur_col] > -49) & (dfsub.dayness == 12)][cur_col].median()- 
                                            dfsub[(dfsub[cur_col] > -49) & (dfsub.dayness == 0)][cur_col].median())

                        # if all values are -49 or less:
                        if len(colpos) == 0:
                            undeterminable_counter += 1
                            column_ids[cur_col] = 'undeterminable_' + str( undeterminable_counter )

                        # if more than 98 percent of values are -49 or less
                        elif colall.quantile(.98) <= -49:
                            undeterminable_counter += 1
                            column_ids[cur_col] = 'undeterminable_' + str(undeterminable_counter)

                        #evaluate whether it is battery voltage
                        # 20th and 80th percentile are between 5 and 15, 10 and 16
                        # standard devation < 0.1
                        # float proportion greater than .5
                        elif (5 <= colpos.min() < 15 and
                              10 <= colpos.max() < 16 and
                              sdev < 1 and
                              float_prop > 0.5):
                            column_ids[cur_col] = "battery_voltage"

                        # evaluate whether it is wind direction
                        # DEPRECATED criteria: min 0-10, max 350-360, median is within 15 degrees of average of 25 percentile and 75 percentile
                        #OR maximum between 359 and 360
                        # 
                        elif (column_position < len(columns_to_id) - 3 and
                              sdev > 10 and day_correlation < 0.1 and 90 < colpos.median() < 270 and
                              colpos.max() < 350):
                            wind_dir_counter += 1
                            if wind_dir_counter == 1:
                                column_ids[cur_col] = 'wind_dir'
                                if colpos2.std() > 0:
                                    column_ids[columns_to_id[column_position+1]] = 'wind_max'
                                    column_ids[columns_to_id[column_position+2]] = 'wind_avg'
                                    column_ids[columns_to_id[column_position+3]] = 'wind_min'
                                else: #column before wind_dir is nan; either following two columns or following
                                     #three columns are wind speeds
                                    if column_position < len(columns_to_id) - 4:
                                        switch = True
                                    else:
                                        switch = False
                                    if switch == True:
                                        colpos3 = dfsub[dfsub[columns_to_id[column_position+2]] > -49][columns_to_id[column_position+2]]        
                                        colpos4 = dfsub[dfsub[columns_to_id[column_position+3]] > -49][columns_to_id[column_position+3]]    
                                        colpos5 = dfsub[dfsub[columns_to_id[column_position+4]] > -49][columns_to_id[column_position+4]]
                                        if not (  0 <= (colpos5.quantile(.25) - colpos4.quantile(.25)) < 30 and
                                              0 <= (colpos4.quantile(.25) - colpos3.quantile(.25)) < 30 and
                                              0 < (colpos5.quantile(.5) - colpos4.quantile(.5)) < 30 and
                                              0 < (colpos4.quantile(.5) - colpos3.quantile(.5)) < 30 and
                                              0 < (colpos5.quantile(.75) - colpos4.quantile(.75)) < 30 and
                                              0 < (colpos4.quantile(.75) - colpos3.quantile(.75)) < 30):
                                            switch = False
                                    if switch == True:
                                        cols_to_skip.append(column_position+4)  #other three are below
                                        column_ids[columns_to_id[column_position+2]] = 'wind_max'
                                        column_ids[columns_to_id[column_position+3]] = 'wind_avg'
                                        column_ids[columns_to_id[column_position+4]] = 'wind_min'
                                    else: # only two following columns are windspeed    
                                        column_ids[columns_to_id[column_position+2]] = 'wind_max'
                                        column_ids[columns_to_id[column_position+3]] = 'wind_min'
                                    if columns_to_id[column_position+1] not in column_ids.keys(): #might have already been identified as bad data above
                                        undetermined_counter += 1
                                        column_ids[columns_to_id[column_position+1]] = 'undetermined_' + str(undetermined_counter)

                                cols_to_skip.append(column_position+1)
                                cols_to_skip.append(column_position+2)
                                cols_to_skip.append(column_position+3)
                            else:
                                column_ids[cur_col] = 'wind_dir_duplicate_' + str(wind_dir_counter)    
                                
                                
                        # evaluate whether it is snowfall

                        elif (janmedian > 30 and (np.isnan(julmedian) or julmedian == 0)):
                            column_ids[cur_col] = 'snow_total'
                        elif (janmedian > 1 and (np.isnan(julmedian) or julmedian == 0)):
                            column_ids[cur_col] = 'snow_24'
                            
                        # evaluate whether it is precipitation
                        elif zero_prop > 0.3:
                            precipitation_counter += 1
                            if 0 <= colpos.median() < 1:
                                decimal_precipitation_counter += 1
                                column_ids[cur_col] = 'precipitation_' + str(precipitation_counter)
                            else:
                                column_ids[cur_col] = 'precipitation_duplicate_' + str(precipitation_counter)
                            ### note the above code was 'hacked' from existing code; here's what happens:
                            # if columns is as yet uncategorized and if more than 30% of values = 0
                            # if median < 1, marked "precipitation_1", "precipitation_2", etc. (never just "precipitation")
                            # otherwise (median >-1), marked "precipitation_duplicate_1", etc.
                            # so they will contain suffix numbers and the word 'duplicate' even if they are not a duplicate
                            # this doesn't matter, as these get routinely sliced off later.
                            # it just serves to identify "definite" precipitation with less definite ones.
                            # it's inelegant, but it's a quick fit

                        # evaluate whether it is temperature
                        elif (use_winterness == True and
                              use_dayness == True and
                              colpos.max() < 100 and
                              0 < colpos.median() < 70 and
                              -0.9 <= winter_correlation <= 0.05 and
                              -0.1 <= day_correlation <= 0.5 and sdev > 0.1):
                            temperature_counter += 1
                            column_ids[cur_col] = 'temperature_' + str(temperature_counter)

                        # evaluate whether it is humidity
                        elif ((97 <= colpos.max() <= 100 or (colpos.quantile(.25) > 30 and
                            colpos.max() <= 100)) and
                            -0.2 < winter_correlation < 0.75 and
                            -0.8 < day_correlation < 0.1):
                            column_ids[cur_col] = 'humidity'



                        elif use_winterness == False or use_dayness == False:
                            undeterminable_counter += 1
                            column_ids[cur_col] = 'undeterminable_' + str( undeterminable_counter)

                        else:
                            undetermined_counter += 1
                            column_ids[cur_col] = 'undetermined_' + str( undetermined_counter)

                    if column_ids[cur_col][:8] != 'undeterm':
                        if np.isnan(julmedian):
                            julmedian= -47
                        category_col = column_ids[cur_col]
                        category_col = category_col.replace('_24', '_xxx')
                        for i in range(1,9):
                            category_col = category_col.replace('_'+str(i), '')
                            category_col = category_col.replace('_duplicate', '')
                        category_col = category_col.replace('_xxx', '_24')
                        key = current_filename+"__"+cur_col
                        temp_samples.append(key)
                        temp_categories.append(category_col)
                        templist = []
                        templist.append([colpos.min(),
                                        colpos.median(),
                                        colpos.max(),
                                        colpos.quantile(.75) - colpos.quantile(.25),
                                        sdev,
                                        float_prop,
                                        winter_correlation,
                                        day_correlation,
                                        janmedian,
                                        julmedian,
                                        zero_prop,
                                        noon_midn_med_dif])
                        temp_parameters.append(templist)
                        training_params[key] = templist
                        
                    if column_ids[cur_col][:14] == 'undeterminable':
                        if np.isnan(julmedian):
                            julmedian= -47
                        key = current_filename+"__"+cur_col
                        templist = []
                        templist.append([colpos.min(),
                                        colpos.median(),
                                        colpos.max(),
                                        colpos.quantile(.75) - colpos.quantile(.25),
                                        sdev,
                                        float_prop,
                                        winter_correlation,
                                        day_correlation,
                                        janmedian,
                                        julmedian,
                                        zero_prop,
                                        noon_midn_med_dif])
                        reject_params[key] = templist
                        
                    if column_ids[cur_col][:12] == 'undetermined':
                        if np.isnan(julmedian):
                            julmedian= -47
                        key = current_filename+"__"+cur_col
                        templist = []
                        templist.append([colpos.min(),
                                        colpos.median(),
                                        colpos.max(),
                                        colpos.quantile(.75) - colpos.quantile(.25),
                                        sdev,
                                        float_prop,
                                        winter_correlation,
                                        day_correlation,
                                        janmedian,
                                        julmedian,
                                        zero_prop,
                                        noon_midn_med_dif])
                        machine_params[key] = templist

                    if len(column_ids) > len(columns_to_id):
                            print "Column(s) missing"
                            print column_ids
                
                    if process_all_pickles == False and do_part != -1: #review one filename
                        print cur_col.ljust(6),
                        print ("%0.1f" % (colpos.min())).rjust(5),
                        print ("%0.1f" % (colpos.quantile(.25))).rjust(5),
                        print ("%0.1f" % (colpos.quantile(.50))).rjust(5),
                        print ("%0.1f" % (colpos.quantile(.75))).rjust(5),
                        print ("%0.1f" % (colpos.max())).rjust(5),
                        print ("%0.1f" % (colpos.mean())).rjust(5),
                        print ("%0.3f" % (winter_correlation)).rjust(7),
                        print ("%0.3f" % (day_correlation)).rjust(7),
                        print ("%0.3f" % (zero_prop)).rjust(7),
                        print ("%0.3f" % (sdev)).rjust(7),
                        print ("%0.3f" % (float_prop)),
                        #print ' '
                        print column_ids[cur_col]
                
                if decimal_precipitation_counter > 0:
                    for key in column_ids.keys():
                        if column_ids[key].find('precipitation_duplicate') != -1:
                            column_ids[key] = 'undetermined' # no suffix number given or needed because they ended up not being used
                        
                if bimodal_counter >= 3:
                    for key in column_ids.keys():
                        column_ids[key] = 'bimodal'       
                
                for position in range(len(temp_samples)):
                    concatname = temp_samples[position]
                    part1 = temp_samples[position][:-7]
                    part2 = temp_samples[position][-5:]
                    if column_ids[part2][:12] == 'undetermined':
                        tempvalue = training_params[concatname] 
                        del training_params[concatname]
                        machine_params[concatname] = tempvalue
                        temp_samples[position] = 'XXXXXXXXXXXX' #not very elegant, I know...
                        temp_categories[position] = 'XXXXXXXXXXXX'
                        temp_parameters[position] = 'XXXXXXXXXXXX'
                    if column_ids[part2][:7] == 'bimodal':
                        temp_categories[position] = 'bimodal'
                
                temp_samples = [x for x in temp_samples if x != 'XXXXXXXXXXXX']
                temp_categories = [x for x in temp_categories if x != 'XXXXXXXXXXXX']
                temp_parameters = [x for x in temp_parameters if x != 'XXXXXXXXXXXX']
                
                for position in range(len(temp_categories)): #brute force removal of a bug that was resulting in strange category names
                    if temp_categories[position][:6] == 'precip':
                        temp_categories[position] = "precipitation"
                
                samples += temp_samples
                categories += temp_categories
                parameters += temp_parameters

                results[current_filename] = column_ids

        if process_all_pickles == True:
            suffix = str(pickle_number)+'.json'
            with open('result_dict'+suffix, 'w+') as f:
                f.write(json.dumps(results))

    if process_all_pickles == True:
        with open('categories.json', 'w+') as f:
            f.write(json.dumps(categories))
        with open('samples.json', 'w+') as f:
            f.write(json.dumps(samples))
        with open('parameters.json', 'w+') as f:
            f.write(json.dumps(parameters))
        with open('reject_params.json', 'w+') as f:
            f.write(json.dumps(reject_params))
        with open('machine_params.json', 'w+') as f:
            f.write(json.dumps(machine_params))
        with open('training_params.json', 'w+') as f:
            f.write(json.dumps(training_params))


0
1
2
3
4
5
6
7
8
9

In [59]:
# auxiliary script to print median per month, then quantiles for a single column
# will only work if in above block, process_all_pickles == False and do_part != -1
# (if do_part = -1, it will work, but will show the last filename in the block)
# (if process_all_pickles == False, it will show the last filename in the entire dataset)

print pickle_number, current_filename
cc = 'raw10'
for i in range(1, 13):
    print cc, i,
    print dfsub[(dfsub[cc] > -49) & (dfsub.month == i)][cc].median()

for i in range(1, 13):
    print cc, i,
    print dfsub[(dfsub[cc] > -49) & (dfsub.dayness == i)][cc].median()    

for i in range(2, 100, 2):
    print str(i).rjust(3),
    print str(dfsub[cc].quantile(i/100.0)).ljust(8)

x = dfsub[(dfsub[cc] > -49)][cc]


9 WAS0708.DAT
raw10 1 16.315
raw10 2 25.09
raw10 3 23.505
raw10 4 31.75
raw10 5 81.2
raw10 6 69.535
raw10 7 59.23
raw10 8 51.34
raw10 9 45.445
raw10 10 32.66
raw10 11 60.7
raw10 12 54.775
raw10 1 40.365
raw10 2 40.025
raw10 3 39.82
raw10 4 39.75
raw10 5 40.15
raw10 6 41.465
raw10 7 43.49
raw10 8 46.095
raw10 9 48.805
raw10 10 50.415
raw10 11 52.91
raw10 12 53.84
  2 11.0128 
  4 13.0428 
  6 14.48   
  8 15.83   
 10 17.18   
 12 18.722  
 14 20.0    
 16 21.2112 
 18 22.28   
 20 23.46   
 22 24.6304 
 24 25.8068 
 26 26.75   
 28 27.99   
 30 29.192  
 32 30.4924 
 34 31.6076 
 36 32.42   
 38 33.56   
 40 34.998  
 42 36.4744 
 44 38.05   
 46 39.89   
 48 41.66   
 50 43.46   
 52 45.5164 
 54 47.2584 
 56 49.19   
 58 51.04   
 60 52.98   
 62 54.7    
 64 56.164  
 66 57.8236 
 68 59.22   
 70 60.504  
 72 61.57   
 74 62.7468 
 76 64.09   
 78 65.44   
 80 67.07   
 82 68.9    
 84 70.6    
 86 72.7    
 88 75.0    
 90 77.3    
 92 79.544  
 94 82.4    
 96 86.7    
 98 92.9    

In [24]:
###
### 5. PERFORM MACHINE LEARNING (RANDOM FOREST METHOD) TO CLASSIFY UNDETERMINED COLUMNS
###

do_this_part = True

if do_this_part == True:

    categories = json.loads(open('categories.json', 'r').read())
    samples = json.loads(open('samples.json', 'r').read())
    parameters = json.loads(open('parameters.json', 'r').read())

    # replace NaN with -999
    for i in range(len(parameters)):
        for j in range(len(parameters[i][0])):  # the [0] is to hack a bug that enclosed lists 3 deep instead of the intended 2
            if np.isnan(parameters[i][0][j]):
                parameters[i][0][j] = -999

    # run 90% of data as training set, `0% as evaluation

    subset_index = range(len(parameters))
    shuffle(subset_index)
    training_set_length = int(0.9 * len(parameters))

    evaluation_index = subset_index[training_set_length:]
    train_index = subset_index[:training_set_length]            

    train_samples = []
    train_categories = []
    train_parameters = []

    for i in range(len(train_index)):
        train_samples.append(samples[subset_index[i]])
        train_categories.append(categories[subset_index[i]])
        train_parameters.append(parameters[subset_index[i]][0])

    evaluation_samples = []
    evaluation_categories = []
    evaluation_parameters = []

    for i in range(len(evaluation_index)):
        evaluation_samples.append(samples[subset_index[i]])
        evaluation_categories.append(categories[subset_index[i]])
        evaluation_parameters.append(parameters[subset_index[i]][0])

    forest = RandomForestClassifier(n_estimators=1000)
    forest.fit(train_parameters, train_categories)

    category_array = np.asarray(train_categories)

    scores = cross_val_score(forest, train_parameters, category_array, cv=20)
    print("Accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std()*2))

    print '\nFeature importances:'
    assert len(parameter_names) == len(forest.feature_importances_)
    for i in range(len(parameter_names)):
        print parameter_names[i].ljust(18), round(forest.feature_importances_[i], 3)

    evaluations = []
    for parameter_list in evaluation_parameters:
        evaluations.append(forest.predict(parameter_list))

    assert len(evaluation_categories) == len(evaluations)

    eval_right = 0
    for i in range(len(evaluations)):
        if str(evaluation_categories[i]) == str(evaluations[i][0]):
            eval_right += 1

    print "\n"
    print "Evaluation set: %d out of %d correct (%d percent)" % (eval_right, len(evaluations), 
                                           int(eval_right * 100.0 / len(evaluations)))

    pickle.dump( forest, open( "forest.pickle", "wb+" ) )


Accuracy: 0.91 (+/- 0.03)

Feature importances:
min                0.054
median             0.141
max                0.09
interquartile      0.078
stdev              0.087
float_prop         0.034
winter_corr        0.041
day_corr           0.026
jan_median         0.19
jul_median         0.153
zero_prop          0.074
noon_midn_med_dif  0.032


Evaluation set: 378 out of 381 correct (99 percent)

In [22]:
categories = json.loads(open('categories.json', 'r').read())
x = pd.Series(categories)
x.unique()


Out[22]:
array([u'snow_total', u'precipitation', u'wind_dir', u'wind_max',
       u'wind_avg', u'wind_min', u'humidity', u'snow_24', u'temperature',
       u'battery_voltage', u'bimodal'], dtype=object)

In [25]:
###
### 6. Build results with predicted "undetermined" data
###

# concatenate results files into one
results_all = {}
for i in range(10):
   results_all = dict(results_all.items() + json.loads(open('result_dict'+str(i)+'.json', 'r').read()).items())
with open('results_all.json', 'w+') as f:
    f.write(json.dumps(results_all))
    
training_params = json.loads(open('training_params.json', 'r').read())
reject_params = json.loads(open('reject_params.json', 'r').read())
machine_params = json.loads(open('machine_params.json', 'r').read())

all_params = dict(training_params.items() + reject_params.items() + machine_params.items())

# replace NaN with -999
for key in training_params.keys():
    for j in range(len(training_params[key][0])): #for some reason, it's a list inside a list
        if np.isnan(training_params[key][0][j]):
            training_params[key][0][j] = -999
for key in reject_params.keys():
    for j in range(len(reject_params[key][0])):
        if np.isnan(reject_params[key][0][j]):
            reject_params[key][0][j] = -999
for key in machine_params.keys():
    for j in range(len(machine_params[key][0])):
        if np.isnan(machine_params[key][0][j]):
            machine_params[key][0][j] = -999

# make pandas dataframe to hold results
dffinal = pd.DataFrame()
for filename in results_all.keys():
    for rawname in results_all[filename].keys():
        dffinal = dffinal.append(pd.DataFrame({'filename':[filename], 'raw_col':[rawname], 
                                               'original_cat':[str(results_all[filename][rawname])]}),
                                                ignore_index=True)

dffinal.reset_index(drop=True, inplace=True)

# populate new results dicts from old, with forest prediction

labels = ['battery_voltage', 'bimodal', 'humidity', 'precipitation', 'snow_24', 
          'snow_total', 'temperature', 'wind_avg', 'wind_dir', 'wind_max', 'wind_min']

forest = pickle.load( open( "forest.pickle", "rb" ) )

# do machine learning prediction and build dict without numeric or "_duplicate" suffixes

dffinal['initial_col'] = ''
dffinal['final_col'] = ''
dffinal['machine_prob'] = ''

for pn in parameter_names:
    dffinal[pn] = ''

for idx, row in dffinal.iterrows():
    concatname = dffinal.filename[idx] + '__' + dffinal.raw_col[idx]
    origcol = dffinal.original_cat[idx]
    if origcol[:12] == 'undetermined':
        dffinal.initial_col[idx] = 'undetermined'
        dffinal.final_col[idx] = str(list(forest.predict(machine_params[concatname]))[0])
        dffinal.machine_prob[idx] = int(forest.predict_proba(machine_params[concatname]).max()*100)
    elif origcol[:14] == 'undeterminable':
        dffinal.initial_col[idx] = 'undeterminable'
        dffinal.final_col[idx] = 'undeterminable'
    else:
        found = False
        for label in labels:
            if origcol[:len(label)] == label:
                dffinal.initial_col[idx] = label
                dffinal.final_col[idx] = label
                found = True
        if found == False:
            raise Exception('Label not found')

# number duplicate columns
            
dffinal.sort(['filename', 'final_col'], ascending=True, inplace=True)

dffinal['final_numbered'] = dffinal.final_col

filenames = dffinal.filename.unique()

for filename in filenames:
    counter_total = {}
    counter = {}
    for idx, row in dffinal[dffinal.filename == filename].iterrows():
        increment_dict(counter_total, dffinal.final_col[idx])
    for idx, row in dffinal[dffinal.filename == filename].iterrows():
        colname = dffinal.final_col[idx]
        if counter_total[colname] > 1 or colname == 'undeterminable':
            increment_dict(counter, colname)
            suffixnum = counter[colname]
            suffixtxt = str(suffixnum)
            if suffixnum < 10:
                suffixtxt = '0' + suffixtxt
            dffinal.final_numbered[idx] = colname + "_" + suffixtxt
            
# add parameters

for idx, row in dffinal.iterrows():
    concatname = dffinal.filename[idx] + '__' + dffinal.raw_col[idx]
    for i in range(len(parameter_names)):
        dffinal[parameter_names[i]][idx] = all_params[concatname][0][i]
            # somehow it ended up being a list within a list, hence the [0]
            
dffinal.to_pickle('dffinal.pickle')
dffinal.to_csv('categories_and_parameters.csv')

In [26]:
###
### 7. WRITE CSVs WITH REPLACED COLUMN NAMES
###

dffinal = pd.read_pickle('dffinal.pickle')

col_list = dffinal.final_numbered.unique()

for pickle_num in range(1,10):
    pickle_name = 'wxdatamunged_' + str(pickle_num) + '.pickle'
    out_prefix = 'wxdata_categorized_' + str(pickle_num)

    start=time.time()

    dfin = pd.read_pickle(pickle_name)
    filenames = list(dfin.FileName.unique())

    dfout = pd.DataFrame(columns=col_list)

    for filename in filenames:
        dftemp = dfin.copy()
        dftemp = dftemp[dftemp.FileName == filename]
        sub_list = []
        for item in dftemp.columns:
            if item[:3] == 'raw' and int(item[-2:]) >= 6:
                sub_list.append(dffinal[(dffinal.filename == filename) & (dffinal.raw_col == item)].final_numbered.iloc[0])
            else:
                sub_list.append(item)
        dftemp.columns = sub_list
        dfout = pd.concat([dfout, dftemp])
    
    dfout.to_pickle(out_prefix+'.pickle')
    dfout.to_csv(out_prefix+'.csv')

In [ ]: