The first part of the data processing/preparation is performed on a Spark DataFrame. This is in principle not
necessary on a small dataset like the one under study.
Nevertheless, the Spark framework has been used, since that would be the way to proceed in the case of a very large dataset distributed on an HDFS cluster.
The tool used is PySpark.
The second part of the data processing/preparation, characterization and modelling, is performed on a Pandas DataFrame. After data cleaning it is advisable to perform data exploration and modelling on a small part of the data stored in a locally resident object. Tools used: Pandas, numpy, scikit-learn, ... The full pipeline can then be run on the distributed dataset using the Spark MLlib package to implement the ML algorithm.
In [154]:
# display all outputs of a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [2]:
# enable retina display quality for figures
%config InlineBackend.figure_format = 'retina'
In [3]:
# do not show warnings (turn on for final presentation)
import warnings
warnings.filterwarnings('ignore')
In [4]:
# platform setting
import os, sys, platform
platform.python_version()
Out[4]:
In [5]:
# some utility
import numpy as np
In [6]:
# Pandas module import and configuration
import pandas as pd
# avoid truncation when displaying long field values
pd.set_option('display.max_colwidth', -1)
# avoid column truncation when displaying high dimension data frames
pd.set_option('display.max_columns', None)
#
pd.options.display.max_rows = 999
# avoid long decimal printouts
pd.options.display.float_format = '{:,.5f}'.format
In [7]:
from pyspark import SparkContext, SQLContext, SparkConf
conf = SparkConf().setMaster("local").set("spark.driver.memory", "1g").set("spark.executor.memory", "1g")
sc = SparkContext(conf = conf)
In [8]:
#spark object entry point Spark 2.0, handle to a spark session
from pyspark.sql import SparkSession
spark = SparkSession(sc)
spark.version
Out[8]:
In [9]:
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.shuffle.partitions",2)
spark.conf.get("spark.sql.shuffle.partitions")
Out[9]:
Out[9]:
Load utility functions and data types
In [10]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, LongType, FloatType, DoubleType, TimestampType
In [11]:
# set path to input data
data_path = 'Data/zubie_trips_anonymous/'
In [12]:
# list local data directory
!ls -ltrh Data/zubie_trips_anonymous | grep .parquet | awk '{print $5, $9}'
In [13]:
# loop over files using the HDFS filesystem interface, as if the data were on a cluster
# read data files and print row counts
hadoop = spark._jvm.org.apache.hadoop
fs = hadoop.fs.FileSystem
conf = hadoop.conf.Configuration()
path = hadoop.fs.Path(data_path)
for f in fs.get(conf).listStatus(path):
    name = str(f.getPath())
    if name.endswith('.parquet'):
        counts = spark.read.parquet(name).count()
        print ("{}, rows: {} ".format( name.split('/')[-1], counts) )
In [14]:
df = spark.read.parquet(data_path)
numOfFieldsRaw = len(df.columns)
numOfRows = df.count()
numOfRowsRaw = numOfRows
print ( "total num of fields: {}".format( numOfFieldsRaw ) )
print ( "total num of rows: {}".format( numOfRowsRaw ) )
In [15]:
print ( "Number of partitions: {}".format(df.rdd.getNumPartitions()) )
In [16]:
print ( " Data Frame Schema: " )
df.printSchema()
In [17]:
df.limit(10).toPandas()
Out[17]:
In [18]:
# Selected columns to inspect and then drop
cols = ['user','start_point_place','end_point_place','mpg_combined','fuel_consumed','fuel_cost']
# count number of NA
for col in cols:
    print ( "field: \'{}\', Number of non NULL entries: {} "
           .format( col, df.select(col).dropna().count() )
          )
# 'tags' field contains an array: let's expand it before counting missing values
print ( "field: \'tags\', Number of non NULL entries: {}"
       .format( df.select(F.explode('tags')).dropna().count() )
      )
# append to the list of columns
cols.append('tags')
# apply drop function iteratively on selected columns
from functools import reduce
from pyspark.sql import DataFrame
df = reduce(DataFrame.drop, cols, df)
print ("\n Columns have been dropped ")
In [19]:
# transform Spark DF into Pandas DF.
# (for a large distributed dataset it would be necessary to sample only a fraction
# of it beforehand)
# Keep only duplicated rows
# Select a single vehicle_nickname, order by date
# Show table
pdf = df.toPandas()
pdf[ pdf.duplicated() ]\
[ pdf['vehicle_nickname']==pdf['vehicle_nickname'].values[0] ]\
.set_index('start_point_timestamp')\
.sort_index()\
.head(10)
Out[19]:
In [20]:
df = df.dropDuplicates()
print( "Number of unique rows/total {}/{}".format( df.count(), numOfRows ) )
numOfRows = df.count()
In [21]:
# get single field NA counts
for c in df.columns:
    n = df.where( F.col(c).isNull() ).count()
    print ("\'{}\' -- null counts/total: {}/{}".format(c, n, numOfRows))
In [22]:
# drop all rows with one or more NA field
df = df.dropna(how='any')
print( "Number of non-NA rows/total: {}/{}".format( df.count(), numOfRows ) )
numOfRows = df.count()
In [23]:
# extract a few rows to show URL
df.select('static_map_url').limit(5).toPandas() #show(5,truncate=False)
import requests
# sample 10% of the rows and check the HTTP request status code (keep only 5 for printing)
for url in [ c['static_map_url'] for c in df.sample(False,0.1,seed=0).limit(5).rdd.collect() ]:
    print (url)
    code = requests.get(url).status_code
    print (' status code {}\n'.format( code ))
# drop column
df = df.drop('static_map_url')
Out[23]:
In [24]:
# list how many distinct units exist for distance, speed and fuel_consumed
df.select('distance_um').distinct().show()
df.select('speed_um').distinct().show()
df.select('fuel_consumed_um').distinct().show()
# list how many different combinations exist for currency, symbol and price per gallon of fuel.
df.select('fuel_cost_currency_code','fuel_cost_currency_symbol','fuel_ppg').distinct().show()
# drop fuel-related fields since no observations are available (e.g. fuel_consumed is empty)
df = df.drop('fuel_cost_currency_code','fuel_cost_currency_symbol','fuel_ppg','fuel_consumed_um')
In [25]:
# get a statistical description of the field pairs expressing the same quantity
df.select('top_speed','top_speed_mph').describe().show()
df.select('obd_distance','obd_miles').describe().show()
df.select('gps_distance','gps_miles').describe().show()
# remove the fields whose names specify the unit, since the unit is stored in a separate field
# and unit uniformity has already been ensured
df = df.drop('top_speed_mph','obd_miles','gps_miles')
In [26]:
#
df.select('trip_segments').distinct().show()
df.select('fuel_type').distinct().show()
#
df = df.drop('trip_segments','fuel_type')
In [27]:
# list of different countries
df.select('start_point_address_country').distinct().show()
df.select('end_point_address_country').distinct().show()
# drop since they are constant fields
df = df.drop('start_point_address_country','end_point_address_country')
- There is a "suspicious" name: 'Cymru'
- Cymru is the Welsh name of Wales
In [28]:
# list of different states
df.select('start_point_address_state').distinct().show()
df.select('end_point_address_state').distinct().show()
# list of start-end point state combinations
df.select('start_point_address_state','end_point_address_state').distinct().show()
- Inspect rows with 'Cymru' (only one actually) and then change the name to Wales
to get consistent naming
In [29]:
print (" Num of records with with state adress = Cymru: {} "
.format( df.filter(df['start_point_address_state']=='Cymru').count() )
)
# look at the data
df.filter(df['start_point_address_state']=='Cymru')\
.toPandas().head()
# convert name
df = df.replace('Cymru','Wales')
Out[29]:
In [30]:
# get occurrences of start/end point time zone combinations
# only BST-BST and GMT-GMT. Keep only one of the two
df.groupBy('start_point_timestamp_tz').pivot('end_point_timestamp_tz').count().show()
# get occurrences of timezone/daylight_saving_time_flag combinations
# since the pairing is consistent no fix is needed and the flag fields can be removed
df.groupBy('start_point_timestamp_tz').pivot('start_point_daylight_saving_time_flag').count().show()
df.groupBy('end_point_timestamp_tz').pivot('end_point_daylight_saving_time_flag').count().show()
df = df.drop('start_point_daylight_saving_time_flag','end_point_daylight_saving_time_flag')\
.drop('end_point_timestamp_tz')
# rename so that the field name length is closer to its (3-character) values
df = df.withColumnRenamed('start_point_timestamp_tz','trip_tz')
In [31]:
# define a function for casting a list of columns to pyspark.sql.types.TimestampType
def colsToTimestamp(df, colnames, timestamp_format):
    for name in colnames:
        df = df.withColumn(name, F.unix_timestamp(df[name],timestamp_format).cast(TimestampType()))
    return df
timestamp_format = 'yyyy-MM-dd HH:mm:ss'
#fields to be converted
timestamp_cols = ['start_point_timestamp','start_point_timestamp_utc','end_point_timestamp','end_point_timestamp_utc']
df = colsToTimestamp(df,timestamp_cols,timestamp_format)
# print checks
print (" check types schema: ")
print ( [(c.name,c.dataType) for c in df.schema if c.name in timestamp_cols] )
# display some rows
df.limit(5).toPandas().head()
Out[31]:
In [32]:
# keep only October
# group to get one entry per day
# order
col = 'start_point_timestamp'
df.select(col,'trip_tz' )\
.filter(F.month(col)==10)\
.groupBy(F.year(col),F.month(col),F.dayofmonth(col),'trip_tz')\
.count()\
.orderBy(F.year(col),F.month(col),F.dayofmonth(col))\
.toPandas().head(31)
Out[32]:
In [33]:
# select timestamp-type columns
timing_cols = [n for n in df.columns if 'timestamp' in n]
# add necessary info for the calculations
timing_cols.append('trip_tz')
timing_cols.append('duration_seconds')
# re-compute trip duration as timestamps difference for a few entries
# and show results
df.select( [c for c in timing_cols] ) \
.withColumn('mydt',F.unix_timestamp(df['end_point_timestamp_utc'])-F.unix_timestamp(df['start_point_timestamp_utc'])) \
.limit(5).toPandas().head() #(.show(5,truncate=False)
# re-compute for the whole dataset to check aggregated stats,
# as a reasonable proof that the two calculations always return the same result
df.select( [c for c in timing_cols] ) \
.withColumn('mydt',F.unix_timestamp(df['end_point_timestamp_utc'])-F.unix_timestamp(df['start_point_timestamp_utc'])) \
.describe('duration_seconds','mydt') \
.show()
# drop fields that are not necessary for now
df = df.drop('start_point_timestamp_utc','end_point_timestamp_utc')
Out[33]:
In [34]:
#
df = df.withColumnRenamed('start_point_latitude','start_point_lat') \
.withColumnRenamed('start_point_longitude','start_point_lon') \
.withColumnRenamed('end_point_latitude','end_point_lat') \
.withColumnRenamed('end_point_longitude','end_point_lon')
In [35]:
#
latlong_cols = ['start_point_lat','start_point_lon','end_point_lat','end_point_lon']
timestamp_cols = [n for n in df.columns if 'timestamp' in n]
distance_cols = ['gps_distance','obd_distance']
duration_cols = ['duration_seconds','idle_seconds']
In [36]:
# useful columns for the study
cols = timestamp_cols + duration_cols + latlong_cols + ['gps_distance']
#
print (" number of rows where idle time >= trip duration: {}"\
.format( df.where(df['idle_seconds'] >= df['duration_seconds']).count() )
)
#
print (" number of rows where idle time < 0: {}" \
.format( df.where(df['idle_seconds'] < 0).count() )
)
# display data for direct inspection
print(' \n Entries with idle > duration')
df.select(cols)\
.where( df['idle_seconds'] >= df['duration_seconds'] )\
.toPandas()
print(' Entries with idle < 0 ')
df.select(cols)\
.where( df['idle_seconds'] < 0 )\
.toPandas() #.show(truncate=False)
# remove inconsistent entries
df = df.where( (df['duration_seconds'] > df['idle_seconds']) & (df['idle_seconds'] >= 0) )
Out[36]:
Out[36]:
In [37]:
print ("rows after selection/total: {}/{}".format( df.count(), numOfRows) )
numOfRows = df.count()
In [38]:
df.select(latlong_cols)\
.describe()\
.show()
In [39]:
df.select('top_speed')\
.describe()\
.show()
df.select( [c for c in df.columns if c.startswith('speeding')] )\
.describe()\
.show()
df.select( [c for c in df.columns if 'brake' in c ] )\
.describe()\
.show()
df.select( [c for c in df.columns if 'accel' in c] )\
.describe()\
.show()
df.select( [c for c in df.columns if c.startswith('points')] )\
.describe()\
.show()
In [40]:
schema_cols = [(c.name,c.dataType) for c in df.schema]
for c in schema_cols:
    name,dtype = c
    if dtype == DoubleType():
        df = df.withColumn(name,df[name].cast(FloatType()))
    elif dtype == LongType():
        df = df.withColumn(name,df[name].cast(IntegerType()))
#
df = df.withColumn('duration_seconds',df['duration_seconds'].cast(IntegerType()))
df.printSchema()
In [41]:
print (" key entries/total: {}/{}"\
.format( df.select('key').distinct().count(), numOfRows )
)
df = df.drop('key')
In [42]:
print (" Number of device_key: {}"\
.format( df.select('device_key').distinct().count() )
)
In [43]:
print ("Number of unique vehicle_key, vehicle_nickname: ",\
df.select('vehicle_key','vehicle_nickname').groupBy('vehicle_key','vehicle_nickname').count().count()
)
df.select('vehicle_key','vehicle_nickname')\
.groupBy('vehicle_key','vehicle_nickname')\
.count().orderBy('vehicle_nickname')\
.toPandas()
#.show(n=100,truncate=False)
Out[43]:
In [44]:
#
df = df.drop('vehicle_key')
In [45]:
# get list of unique device_key's
device_keys = df.select('device_key').distinct().orderBy('device_key').rdd.flatMap(lambda x:x).collect()
# register UDF function to map key to an integer given by its position in the array
getID_udf = F.udf(lambda k: device_keys.index(k), IntegerType())
# create new column
df = df.withColumn('device_id',getID_udf(df['device_key']))
df = df.drop('device_key')
In [46]:
df.printSchema()
In [47]:
# remove the 'address' tag
for c in [c for c in df.columns if 'address' in c]:
    df = df.withColumnRenamed(c,c.replace('address_',''))
In [48]:
# ordered list
cols = ['device_id','vehicle_nickname'] \
+ [c for c in df.columns if c.startswith('start')] \
+ [c for c in df.columns if c.startswith('end')] \
+ ['trip_tz'] \
+ ['duration_seconds','idle_seconds'] \
+ ['gps_distance','obd_distance'] \
+ ['top_speed'] \
+ ['hard_accel_count','hard_brake_count'] \
+ ['points_city_count','points_hwy_count'] \
+ [c for c in df.columns if c.startswith('speeding')] \
+ ['speed_um','distance_um']
# check that no cols are left out
print( 'new/old column list length {}/{}'.format( len(cols), len(df.columns) ) )
In [49]:
# apply re-shuffling
df = df.select(cols)
#
df = df.orderBy('start_point_timestamp','device_id')
In [50]:
print('\n list of columns: ')
df.columns
Out[50]:
In [51]:
df.limit(10).toPandas()
Out[51]:
In [52]:
print(' Number of selected fields/total: {}/{} '.format( len(df.columns), numOfFieldsRaw ) )
print(' Number of selected rows/total: {}/{} '.format( numOfRows, numOfRowsRaw) )
In [53]:
# Prepared Spark data frame can be written to file system for later use
#! rm -rf ./output
#df.repartition(1).write.parquet('./output')
In [54]:
#spark.stop()
In [55]:
# control cell printout
InteractiveShell.ast_node_interactivity = 'last_expr'
In [56]:
# import graphics library
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
In [57]:
# graphics settings
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20
plt.rc('font', size=BIGGER_SIZE) # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
#
axis_font = {'fontname':'Arial', 'size':'24'}
In [151]:
pdf = df.toPandas()
#write to file for later use
#pdf.to_pickle('zubie.pkl')
- One layer holding a heatmap
- One layer holding a clustered representation of trip starting points. Point clusters can be
recomputed according to the distance scale
In [155]:
# Load necessary modules
#
from IPython.core.display import HTML, display
from IPython.display import IFrame
# Module to handle Pandas Dataframes and geo-data
import geopandas as gpd
from geopandas.tools import sjoin
# Interactive Map with layer support
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
from folium.element import IFrame
# utilities
import shapely
from shapely.geometry import Point
In [61]:
# Build a GeoSeries of shapely Points
# Select coordinate system EPSG:4326, which is the standard WGS84 coordinate system
geo_series = gpd.GeoSeries( pdf.apply(lambda z: Point(z['start_point_lat'], z['start_point_lon']), 1)
,crs={'init': 'epsg:4326'} )
In [62]:
geo_series.head()
Out[62]:
In [63]:
# join geo_series to pdf and drop the original columns
# In general it is advisable to subset pdf to retain fields relevant for maps
# (not critical on small data sets)
geo_df = gpd.GeoDataFrame( pdf.drop(['start_point_lat', 'start_point_lon'], 1)
,geometry=geo_series )
In [64]:
# display the data frame with the new field 'geometry'
# Such field is added as the last one: let's move it to the front
# before displaying
geo_df = geo_df[ ['geometry']+geo_df.columns[:-1].tolist() ]
geo_df.head()
Out[64]:
In [65]:
# center the map at the mean of the lat-lon grid from the data (starting points only)
lat,lon = pdf[['start_point_lat']].mean(), pdf[['start_point_lon']].mean()
geo_map = folium.Map(
location=[np.float(lat), np.float(lon)]
,control_scale = True # show distance scales
,zoom_start = 7 # adjusted to show most of the country
)
In [66]:
def add_heatmap(base_map, gdf, mag_field=None):
    """
    Add a heatmap layer to the Folium base_map using points from the
    POINT-type field 'geometry' in the GeoDataFrame gdf.
    Values in the mag_field column can be used as weights.
    """
    # Create empty lists to hold coordinates and magnitude values if provided
    lat, lon, mag = [], [], []
    coords = []
    # loop through rows of the GeoDataFrame
    for i, row in gdf.iterrows():
        coords.append([np.float(row.geometry.x), np.float(row.geometry.y)])
        lat.append( np.float(row.geometry.x) )
        lon.append( np.float(row.geometry.y) )
        m = row[mag_field] if mag_field else 1
        mag.append( m )
    # create a Folium Feature Group for the layer
    layer = folium.FeatureGroup(name = 'heatmap')
    layer.add_children(HeatMap(zip(lat,lon,mag)))
    # add this point layer to the map object
    base_map.add_children(layer)
    return base_map
In [67]:
def add_clustered_points(base_map, gdf, popup_fields=None ):
    """
    Add a clustered point layer to the Folium base_map using points from the
    POINT-type field 'geometry' in the GeoDataFrame gdf.
    Values in the popup_fields columns can be used as text to be displayed in
    the marker popup window.
    """
    # Create empty lists to hold coordinates and popup info text if provided
    coords, pops = [], []
    # loop through rows of the GeoDataFrame
    for i, row in gdf.iterrows():
        coords.append([np.float(row.geometry.x), np.float(row.geometry.y)])
        if popup_fields:
            # create a string of HTML code joining values from
            # the fields listed in 'popup_fields' with a linebreak between them
            label = '<br>'.join([row[field] for field in popup_fields])
            # append an IFrame that uses the HTML string to the "pops" list
            pops.append( IFrame(label, width = 300, height = 100) )
    # create a Folium feature group for the layer
    layer = folium.FeatureGroup(name = 'clusters')
    # add the clustered points of trip start locations and popups to this layer
    if not popup_fields: pops = None
    layer.add_children( MarkerCluster(locations = coords, popups=pops) )
    # add this point layer to the map object
    base_map.add_children(layer)
    return base_map
- no magnitude field is provided to the heatmap: it will represent the density of points
In [68]:
#
geo_map = add_heatmap(geo_map, geo_df)
#
geo_map = add_clustered_points(geo_map, geo_df)
# Optionally you can pass a list of fields to be displayed
# in the popup infobox (not advisable on large datasets)
# e.g.:
# geo_map = add_clustered_points(geo_map, geo_df, ['vehicle_nickname','top_speed'])
# add layer control to be shown on the map
folium.LayerControl().add_to( geo_map )
Out[68]:
- zooming in and out will trigger a different re-clustering of the points. The polygons defining the
cluster boundaries can be visualized as well by hovering over a cluster.
In [69]:
# geo_map.save('geo.html')
geo_map
Out[69]:
In [162]:
from IPython.display import Image
Image(filename='cluster_map_0.png',width=500, height=100)
Out[162]:
In [165]:
from IPython.display import Image
Image(filename='cluster_map_1.png',width=500, height=100)
Out[165]:
In [163]:
from IPython.display import Image
Image(filename='heat_map_0.png',width=500, height=100)
Out[163]:
In [164]:
from IPython.display import Image
Image(filename='heat_map_1.png',width=500, height=100)
Out[164]:
In [70]:
#pdf = pd.read_pickle('zubie.pkl')
In [71]:
# convert seconds to minutes
pdf.rename(columns={'duration_seconds':'duration','idle_seconds':'idle'}, inplace = True)
pdf['duration'] = pdf.eval('duration/60.')
pdf['idle'] = pdf.eval('idle/60.')
# effective driven time
pdf['drive_time'] = pdf.eval('duration - idle')
# mph
pdf['ave_speed'] = pdf.eval('60.*(obd_distance/drive_time)')
# hard maneuvers
pdf['hard_manv_count'] = pdf.eval('hard_accel_count + hard_brake_count')
# hard maneuvers per mile (distance has already been validated to be always >0)
pdf['hard_manv_pm'] = pdf.eval('hard_manv_count/obd_distance')
- the geodesic distance (i.e. the shortest one) between the start point and end point of a trip may capture a
characteristic of the trip complementary to the effectively driven distance (obd_distance)
In [72]:
# calculate geo-distance from lat/lon coordinates
import geopy
from geopy.distance import vincenty
def getGeoDist(row):
    start = (row['start_point_lat'],row['start_point_lon'])
    end = (row['end_point_lat'],row['end_point_lon'])
    return vincenty(start,end).miles
pdf['geo_distance'] = pdf.apply(getGeoDist, axis=1)
In [73]:
columns = ['device_id', 'vehicle_nickname', 'start_point_city' \
,'start_point_state', 'start_point_zipcode', 'start_point_lat' \
,'start_point_lon', 'start_point_timestamp', 'end_point_city' \
,'end_point_state', 'end_point_zipcode', 'end_point_lat', 'end_point_lon' \
,'end_point_timestamp', 'duration','idle','drive_time' \
,'gps_distance','obd_distance','geo_distance' \
,'top_speed','ave_speed' \
,'hard_accel_count', 'hard_brake_count','hard_manv_count', 'hard_manv_pm'\
,'points_city_count', 'points_hwy_count', 'speeding_city_major_count' \
,'speeding_city_minor_count', 'speeding_hwy_major_count' \
,'speeding_hwy_minor_count']
In [74]:
pdf = pdf[columns]
pdf.head()
Out[74]:
In [75]:
pdf.shape[0]
Out[75]:
In [76]:
print(' counts of hard accelerations+brakes, as a fraction of the total: ')
(pdf['hard_manv_count'].value_counts()/pdf.shape[0]).head()
Out[76]:
In [77]:
#
fig, ax = plt.subplots(1,2,figsize=(20,5.5));
# superimpose three distributions, normalized to show the
# relative frequency of each count
args = dict(bins=5, range=(0,5), normed=True, alpha=0.3, align='left'); #opts
ax[0].hist(pdf.hard_accel_count, label='accels', **args);
ax[0].hist(pdf.hard_brake_count, label='brakes', **args);
ax[0].hist(pdf.hard_manv_count, label='accels+brakes',**args);
ax[0].set_xlabel('counts')
ax[0].set_ylabel('frequency [fractional]')
ax[0].legend();
# blank second panel
#ax[1].axis('off');
ax[1].hist(pdf[pdf.hard_manv_count>0].hard_manv_pm,bins=20, range=(0,1), alpha=0.5);
ax[1].set_xlabel('accels+brakes per mile')
ax[1].text(0.4,150,' only trips with at least one \n hard accel or one hard brake')
Out[77]:
In [78]:
# keep only needed features
vars = ['geo_distance','obd_distance','gps_distance']
data = pdf[vars]
# some data selection for better visualization
dmax= 30
data = data[ (data.gps_distance<dmax) & (data.obd_distance<dmax) & (data.geo_distance<dmax) ]
# flag trips based on the condition on obd and geo distance
hue = ['','obd > geo','obd < geo'] #[legend title,items...]
data[hue[0]] = (data.obd_distance > data.geo_distance).apply(lambda x: hue[1] if x else hue[2])
# prepare for visualization
data.columns = ['geo [mi]', 'obd [mi]','gps [mi]',hue[0]]
sns.set_style('whitegrid', {'axes.edgecolor': '0.5'})
g = sns.PairGrid(data, hue=hue[0], size=3, palette={hue[1]:'g',hue[2]:'r'})
# histo on diagonal
g = g.map_diag(plt.hist, bins=30, histtype="step", linewidth=3)
# scatter on the upper side
g = g.map_upper(plt.scatter, alpha=0.7, edgecolor="w")
# heatmaps (2d-histograms) on the lower side (useful to visualize data clustering)
# Adding plt.hist2d to the sns PairGrid API is not straightforward:
# the plt.hist2d API is not fully compatible with PairGrid
# because it doesn't know how to handle a color= kwarg.
# It is necessary to use a thin wrapper function.
def pairgrid_heatmap(x, y, **kws):
    cmap = sns.light_palette(kws.pop("color"), as_cmap=True)
    plt.hist2d(x, y, cmap=cmap, cmin=1, **kws)
g.map_lower(pairgrid_heatmap, bins=(np.linspace(0,dmax,dmax+1),np.linspace(0,dmax,dmax+1)))
#
g = g.add_legend()
# adjust number of ticks and avoid overlaps
for ax in g.axes.flat:
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(4, prune="both"))
    ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(4, prune="both"))
# adjust subplots spacing
g.fig.subplots_adjust(hspace=.05, wspace=.05);
In [79]:
print(' num. of trips where gps distance <= geo distance: {}/{} '.format(
pdf[pdf['gps_distance'] <= pdf['geo_distance']].shape[0], pdf.shape[0] )
)
#show data
pdf[pdf['gps_distance'] < pdf['geo_distance']].head()
Out[79]:
In [80]:
print(' num. of trips where obd distance <= geo distance: {}/{} '.format(
pdf[pdf['obd_distance'] <= pdf['geo_distance']].shape[0],pdf.shape[0] )
)
#show data
pdf[pdf['obd_distance'] < pdf['geo_distance']].head()
Out[80]:
In [81]:
pdf = pdf[pdf['obd_distance'] > pdf['geo_distance']]
print( 'Tot. num of rows: {}'.format(pdf.shape[0]))
In [82]:
print(' num. of trips where top speed < ave speed: {}/{} '.format(
pdf[pdf['top_speed'] <= pdf['ave_speed']].shape[0], pdf.shape[0] )
)
# filter out those trips
pdf = pdf[pdf['top_speed'] > pdf['ave_speed']]
In [83]:
print( 'Tot. num of rows: {}'.format(pdf.shape[0]))
- trips with a high number of hard maneuvers have higher average and top speeds
- trips with non-null highway counts show a top speed peak around the highway (non-urban) speed limit
- trips with null highway counts show a top speed peak around the city (urban area) speed limit
- shorter trips are usually urban-area trips
In [84]:
# features to plot
vars = ['ave_speed','top_speed','obd_distance']
# prepare data selections
data_1_1 = pdf[ pdf.hard_manv_count>3 ][vars]
data_1_2 = pdf[ pdf.hard_manv_count==0 ][vars]
data_2 = pdf[ ((pdf.drive_time>15) | (pdf.points_hwy_count>0)) ][vars]
data_3 = pdf[ ((pdf.drive_time<15) | (pdf.points_hwy_count==0)) ][vars]
# setup graphics
fig, ax = plt.subplots(1,3,figsize=(20,5),sharey=True);
# first panel
#
sns.kdeplot(data_1_1.top_speed, data_1_1.ave_speed, shade=True, shade_lowest=False, cmap='Blues',alpha=0.7, ax=ax[0])
sns.kdeplot(data_1_2.top_speed, data_1_2.ave_speed, shade=True, shade_lowest=False, cmap='Reds',alpha=0.7, ax=ax[0])
ax[0].set_title('')
ax[0].set_xlabel('top speed [mph]')
ax[0].set_xlim(0,100)
ax[0].set_ylabel('ave speed [mph]')
ax[0].set_ylim(0,75)
ax[0].text(30,60,'hard accel+brake\'s >3 ',color='b', fontsize=20)
ax[0].text(10,5,'hard accel+brake\'s =0 ',color='r', fontsize=20)
ax[0].axvline(x=30, linewidth=1, linestyle='--')
ax[0].axvline(x=70, linewidth=1, linestyle='--')
ax[0].text(60,80,'UK highway \n speed limit',multialignment='center',color='b',fontsize=15)
ax[0].text(20,80,'UK city \n speed limit',multialignment='center',color='b',fontsize=15)
ax[0].grid(False)
# second panel
#
sns.kdeplot(data_2.top_speed, data_2.ave_speed, shade=True, shade_lowest=False, cmap='Greens',alpha=1, ax=ax[1])
ax[1].set_title('')
ax[1].set_xlabel('top speed [mph]')
ax[1].set_xlim(0,100)
ax[1].set_ylabel('ave speed [mph]')
ax[1].set_ylim(0,75)
ax[1].text(10,50,'points hwy count > 0 \n OR \n drive time > 15 min',multialignment='center',color='g',fontsize=20)
ax[1].axvline(x=70, linewidth=1, linestyle='--')
ax[1].text(60,80,'UK highway \n speed limit',multialignment='center',color='b',fontsize=15)
ax[1].grid(False)
# third panel
#
sns.kdeplot(data_3.top_speed, data_3.ave_speed, shade=True, shade_lowest=False, cmap='Purples', alpha=1, ax=ax[2])
ax[2].set_title('')
ax[2].set_xlabel('top speed [mph]')
ax[2].set_xlim(0,100)
ax[2].set_ylabel('ave speed [mph]')
ax[2].set_ylim(0,75)
ax[2].axvline(x=30, linewidth=1, linestyle='--')
ax[2].text(10,50,'points hwy count = 0 \n OR \n drive time < 15 min', multialignment='center',color='Purple',fontsize=20)
ax[2].text(20,80,'UK city \n speed limit',multialignment='center',color='b',fontsize=15)
ax[2].grid(False)
Comments on the plots above:
In [85]:
# define a function to show categorical fields and their number of levels
def dump_categories(df):
    for col_name in df.columns:
        if df[col_name].dtypes == 'object':
            unique_cat = len(df[col_name].unique())
            print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))
In [86]:
dump_categories(pdf)
zipcode:
In [87]:
pdf[ ['start_point_zipcode','end_point_zipcode']].head(3)
Out[87]:
In [88]:
print( 'total number of trips: {}'.format(pdf.shape[0]))
print( 'num. of unique starting point zip codes: {}'.format(pdf['start_point_zipcode'].unique().shape[0]))
In [89]:
# query UK postcode web API
# use the first zipcode in the dataset
import urllib
code = pdf['start_point_zipcode'].values[0]
url = 'http://api.postcodes.io/postcodes/'+str(code)
res = urllib.urlopen(url).read()
#print results
code
url
res
Out[89]:
In [90]:
# keep only the letters of the first part of the postcode
from string import digits
pdf['start_area_zip']=pdf['start_point_zipcode'].apply(lambda x: str(x).split(' ')[0].translate(None,digits))
pdf['end_area_zip']=pdf['end_point_zipcode'].apply(lambda x: str(x).split(' ')[0].translate(None,digits))
# show some data
pdf[['start_point_zipcode','start_area_zip']].head(3)
Out[90]:
In [91]:
# find list of unique Area Codes
area_code_list = pdf['start_area_zip'].unique().tolist()
second_list = pdf['end_area_zip'].unique().tolist()
area_code_list.extend(x for x in second_list if x not in area_code_list)
print( 'num. of unique area zip codes: {}'.format(len(area_code_list)) )
In [92]:
# define a handy function
def get_fractional_series(s):
    """Take a pd.Series and return a Series of its
    unique values sorted by relative frequency."""
    if s.shape[0]==0:
        return s
    f = s.value_counts()
    if f.sum():
        f /= f.sum()
    return f
In [93]:
# compute relative fractions of starting area codes
fracs = get_fractional_series(pdf['start_area_zip'])
fracs.head()
Out[93]:
In [94]:
# plot first n entries
ax = fracs[:40].plot(kind='bar',figsize=(20,5))
ax.set_xlabel('Area Code',fontsize=24)
ax.set_ylabel('Frequency',fontsize=24)
Out[94]:
In [95]:
# keep only the first item and group the others into the 'Other' category
code_list = fracs.index[0]
pdf['start_area_zip'] = pdf['start_area_zip'].apply(lambda x: x if x in code_list else 'Other')
pdf['end_area_zip'] = pdf['end_area_zip'].apply(lambda x: x if x in code_list else 'Other')
# cross check
print(' area code categories (levels={}): '.format(pdf['start_area_zip'].unique().shape[0]))
pdf['start_area_zip'].unique()
Out[95]:
city:
In [96]:
print( 'total number of trips: {}'.format(pdf.shape[0]))
print( 'num. of unique starting point cities: {}'.format(pdf['start_point_city'].unique().shape[0]))
In [97]:
fracs = get_fractional_series(pdf['start_point_city'])
fracs.head()
Out[97]:
In [98]:
# plot first n entries
ax = fracs[:40].plot(kind='bar',figsize=(20,5))
ax.set_xlabel('Starting Point City',fontsize=24)
ax.set_ylabel('Frequency',fontsize=24)
Out[98]:
In [99]:
#
def cut_fraction(series,f,opt=None):
    """
    Take a series and return the list of UNIQUE items whose
    occurrence frequency is higher than f.
    If opt='cumulative' is specified, return the items contributing
    up to a fraction f of the total.
    """
    if series.shape[0]==0:
        return series
    s = series
    s.sort_values(ascending=False, inplace=True)
    if str(opt).lower()=='cumulative':
        s = s.cumsum()
        return s[s<f].index.tolist()
    else:
        return s[s>f].index.tolist()
In [100]:
city_list = cut_fraction(fracs, 0.03)
pdf['start_point_city'] = pdf['start_point_city'].apply(lambda x: x if x in city_list else 'Other')
pdf['end_point_city'] = pdf['end_point_city'].apply(lambda x: x if x in city_list else 'Other')
# cross check
print(' city categories (counts={}): '.format(pdf['start_point_city'].unique().shape[0]))
pdf['start_point_city'].unique()
Out[100]:
(Timeseries) Date Time Features:
In [101]:
pdf.reset_index(inplace=True)
pdf.set_index('start_point_timestamp', inplace=True)
In [102]:
# first two trips
pdf.head(2)
Out[102]:
In [103]:
# last two trips
pdf.tail(2)
Out[103]:
In [104]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
In [105]:
#
fig,ax = plt.subplots(1,2,figsize=(18,5))
# first panel
#
# resampling at 1 week, count number of trips
pdf['device_id'].resample('w', how='count')\
.plot(kind='line',style='-o',use_index=True, ax=ax[0])
ax[0].set_ylabel('num. of trips', fontsize=24)
ax[0].set_xlabel('')
# second panel
#
# resampling at 1 day, count number of trips
pdf['device_id'].resample('1D', how='count')\
.plot(kind='line',style='-o',use_index=True, ax=ax[1], label='all')
# select only Saturdays and Sundays
d = pdf[pdf.index.dayofweek>=5]['device_id'].resample('1D', how='count')
d[d!=0].plot(kind='line',style='o',use_index=True, ax=ax[1], label='weekends (Sat/Sun)')
ax[1].set_ylabel('num. of trips', fontsize=24)
ax[1].set_xlabel('')
ax[1].legend()
Out[105]:
In [106]:
fig,ax = plt.subplots(1,2,figsize=(18,5))
# a slightly convoluted way to show bar labels in the right order
d = pdf.groupby( [pdf.index.to_datetime().strftime('%a'), pdf.index.weekday] )\
.count()\
.reset_index()\
.sort_values('level_1')\
.set_index('level_0')['device_id']
d.plot(kind='bar',ax=ax[0])
ax[0].set_xlabel('')
ax[0].set_ylabel('number of trips',color='g',fontsize=28)
# bar plot grouping by hour
d = pdf.groupby(pdf.index.hour )\
.count()\
['device_id']
d.plot(kind='bar',ax=ax[1])
ax[1].set_xlabel('hour', color='g', fontsize=24)
ax[1].set_ylabel('number of trips',color='g',fontsize=28)
ax[1].grid(False)
# based on the hour bar plot, let's divide the 24h into three categories
# draw the separation lines
day_zones = [-0.1,6,13,19,23]
for h in day_zones[1:-1]:
    ax[1].axvline(x=h+0.5, linewidth=3, color='r', linestyle='--')
fig.subplots_adjust(hspace=.05, wspace=.2);
In [107]:
# cut data
pdf['start_day_zone'] = pd.cut(pdf.index.hour, day_zones,labels=['Early','Morning','Afternoon','Night'])
# workaround, as two categories with the same name are not possible in pd.cut
pdf['start_day_zone'] = pdf['start_day_zone'].apply(lambda x: x if 'Early' not in x else 'Night')
In [108]:
# set datetime index to end_point
pdf.reset_index(inplace=True)
pdf.set_index('end_point_timestamp', inplace=True)
# apply categorization
pdf['end_day_zone'] = pd.cut(pdf.index.hour, day_zones,labels=['Early','Morning','Afternoon','Night'])
pdf['end_day_zone'] = pdf['end_day_zone'].apply(lambda x: x if 'Early' not in x else 'Night')
pdf[['end_day_zone']].head()
# reset index to start_point
pdf.reset_index(inplace=True)
pdf.set_index('start_point_timestamp', inplace=True)
In [109]:
# check results
pdf[['start_day_zone','end_point_timestamp','end_day_zone']].head()
Out[109]:
In [110]:
#reset index to ordinal
pdf.reset_index(inplace=True)
#add weekday feature
pdf['start_weekday'] = pdf['start_point_timestamp'].apply(lambda x : x.to_datetime().strftime('%a') )
pdf['end_weekday'] = pdf['end_point_timestamp'].apply(lambda x : x.to_datetime().strftime('%a') )
(Longitudinal Data) vehicle-based data:
In [111]:
#
ax = pdf.groupby(['vehicle_nickname'],as_index=True)['drive_time']\
        .sum().sort_values(ascending=False)\
        .apply(lambda x: x/60.)\
        .plot(kind='bar',figsize=(20,7))
ax.set_xlabel('vehicle name', fontsize=28)
ax.set_ylabel('accumulated drive time [h]', fontsize=28)
Out[111]:
In [112]:
# to apply the selection it is necessary to add to each row the accumulated drive time
# over the entire period for that row's vehicle
pdf['drive_time_sum'] = pdf.groupby(['vehicle_nickname'],as_index=False)['drive_time'].transform('sum')
In [113]:
print(' number of vehicles: {}'.format(pdf['vehicle_nickname'].unique().shape[0]))
In [114]:
print( ' num of rows after removing vehicles with less than 2h accumulated drive time/total rows: {}/{} '.
format(pdf[pdf['drive_time_sum']>120].shape[0], pdf.shape[0])
)
In [115]:
pdf = pdf[ pdf['drive_time_sum']>120 ]
pdf.drop('drive_time_sum',axis=1,inplace=True)
In [116]:
#
size = pdf.groupby(['vehicle_nickname'],as_index=False).size().sort_values(ascending=False)
ax = size.plot(kind='bar',figsize=(20,6))
ax.set_xlabel('vehicle name', fontsize=28)
ax.set_ylabel('number of trips', fontsize=28)
Out[116]:
In [117]:
#
print(' bin values: ')
np.sort(size)
Out[117]:
In [118]:
# select names of the last four
names = size.index[-4:].tolist()
print(names)
In [119]:
# select DF rows
n = pdf.shape[0]
pdf = pdf[ ~pdf.vehicle_nickname.isin(names) ]
print( ' num of rows after removing vehicles with less than 10 trips/total rows: {}/{} '.
format(pdf.shape[0],n)
)
In [120]:
print(' number of vehicles: {}'.format(pdf['vehicle_nickname'].unique().shape[0]))
In [121]:
pdf.dtypes
Out[121]:
In [122]:
# list of columns with ordering
columns = [ 'vehicle_nickname'\
#
,'start_point_state','start_point_city','start_area_zip'\
,'start_point_lat','start_point_lon'\
,'start_day_zone','start_weekday'\
#
,'end_point_state','end_point_city','end_area_zip'\
,'end_point_lat','end_point_lon'\
,'end_day_zone','end_weekday'\
#
,'top_speed','ave_speed'\
,'obd_distance','geo_distance'\
,'drive_time','idle'\
,'hard_manv_count'
]
# #
# ,'points_city_count','points_hwy_count'\
# ,'speeding_city_major_count','speeding_city_minor_count'\
# ,'speeding_hwy_major_count','speeding_hwy_minor_count'
#]
In [123]:
# extract columns
pdf = pdf[ columns ]
In [124]:
# final Data Frame shape
print(' Number of rows, features:')
pdf.shape
Out[124]:
In [125]:
numeric_features = list(pdf.dtypes[(pdf.dtypes == 'int64') | (pdf.dtypes == 'float64') ].index)
categorical_features = list(pdf.dtypes[(pdf.dtypes == 'object') ].index)
In [126]:
numeric_features
Out[126]:
In [127]:
categorical_features
Out[127]:
A Random Forest regression model will be used for the prediction. The target is the binary indicator of a trip with no hard maneuvers (hard_manv_count == 0), and the model is evaluated with the ROC AUC score computed on the out-of-bag predictions.
In [128]:
X = pdf
y = X.pop('hard_manv_count')
In [129]:
y = (y==0).astype(int)
In [130]:
dump_categories(X)
For the regression model to work, features with categorical values need to be converted into features with integer values. There are typically two ways to accomplish that:
LabelEncoder: each category of the feature is mapped to an integer.
OneHotEncoding: preferred when a numerical ordering among the categories has no real meaning. Such encoding implies the creation of a very large number of features, which should then be reduced via dimensionality-reduction techniques. It also makes the model more prone to overfitting and can be computationally intensive.
A Random Forest regression can handle LabelEncoding, therefore this is the chosen encoding. A test was also performed with OneHotEncoding and no difference in performance was observed.
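As a toy illustration (with made-up weekday values, not taken from the dataset), the two encodings can be compared on a small Pandas series:
toy = pd.Series(['Mon', 'Tue', 'Mon', 'Sun'])
# label encoding: one integer code per category (codes follow alphabetical category order)
toy.astype('category').cat.codes        # -> 0, 2, 0, 1
# one-hot encoding: one binary indicator column per category
pd.get_dummies(toy, prefix='day')       # -> columns day_Mon, day_Sun, day_Tue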
In [131]:
def onehot_encoder(df, col_list=None):
    if col_list:
        feat_list = col_list
    else:
        feat_list = list(df.dtypes[(df.dtypes == 'object') ].index)
    for x in feat_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        df = df.drop(x, 1)
        df = pd.concat([df, dummies], axis=1)
    return df
In [132]:
def label_encoder(df, col_list=None):
    if col_list:
        feat_list = col_list
    else:
        feat_list = list(df.dtypes[(df.dtypes == 'object') ].index)
    for feat in feat_list:
        unique_cat = df[feat].unique().tolist()
        df[feat] = df[feat].apply(lambda x: unique_cat.index(x))
    return df
In [133]:
# example of OneHotEncoding of the feature 'start_weekday'
#dm = pd.get_dummies(X['start_weekday'],prefix='start_weekday', dummy_na=False)
#dm.head()
In [134]:
X = label_encoder(X)
In [135]:
X.head()
Out[135]:
In [136]:
# import module
from sklearn.ensemble import RandomForestRegressor
# The error metric. In this case, we will use c-stat (aka ROC/AUC)
from sklearn.metrics import roc_auc_score
In [137]:
#
model = RandomForestRegressor(n_estimators=100, max_depth=None, oob_score=True, random_state=42)
#
model.fit(X, y)
Out[137]:
In [138]:
y_oob = model.oob_prediction_
print(' ROC AUC score: {0:2.5f}'.format(roc_auc_score(y, y_oob )) )
In [139]:
# just as a cross-check
model.feature_importances_
print(' sum of fractional feature importances: {0:0.2f}'.format(sum(model.feature_importances_)))
In [140]:
# First prepare a plotting function able to also handle OneHotEncoded features, if any.
# If features are expanded via OneHotEncoding they have to be summed up before visualization
def graph_feature_importances(model, feature_names, autoscale=True, headroom=0.05, width=10, summarized_columns=None):
    """
    Graphs the feature importances of a model using a horizontal bar chart.
    feature_names = a list of the names of those features, displayed on the Y axis.
    autoscale = True (automatically adjust the X axis size to the largest feature + headroom) / False = scale from 0 to 1
    headroom = used with autoscale, 0.05 default
    width = figure width in inches
    summarized_columns = a list of column prefixes to summarize on, for dummy variables (e.g. ["day_"] would summarize all day_ vars)
    """
    if autoscale:
        x_scale = model.feature_importances_.max() + headroom
    else:
        x_scale = 1
    feature_dict = dict(zip(feature_names, model.feature_importances_))
    if summarized_columns:
        # some dummy columns need to be summarized
        for col_name in summarized_columns:
            # sum all the features that contain col_name, store in temp sum_value
            sum_value = sum(x for i, x in feature_dict.items() if col_name in i )
            # now remove all keys that are part of col_name
            keys_to_remove = [i for i in feature_dict.keys() if col_name in i ]
            for i in keys_to_remove:
                feature_dict.pop(i)
            # lastly, re-add the summarized field
            feature_dict[col_name] = sum_value
    results = pd.Series(feature_dict)
    results.sort_values(inplace=True)
    results.plot(kind="barh", figsize=(width,len(results)/2), xlim=(0,x_scale))
In [141]:
graph_feature_importances(model, X.columns, summarized_columns=categorical_features)
In [142]:
# turn this off for this section
InteractiveShell.ast_node_interactivity = 'none'
In [143]:
# init values to default values
n_trees_best = 100
features_type_best = 'auto'
samples_leaf_best = 1
max_depth_best = None
In [144]:
roc_scores = []
n_trees = [30, 50, 100, 200, 500, 1000, 2000]
for trees in n_trees:
    model = RandomForestRegressor( n_estimators = trees, oob_score=True, n_jobs=-1, random_state=42 );
    model.fit(X, y);
    score = roc_auc_score(y, model.oob_prediction_)
    roc_scores.append(score)
    print(' num of trees: {0:3d} -> roc score: {1:0.5f}'.format(trees,score))
#
n_trees_best = n_trees[ np.argmax(roc_scores) ]
print('\n num of trees best option: {}'.format(n_trees_best))
#
ax = pd.Series(roc_scores, n_trees).plot(fontsize=15, ylim=(0.7,0.9));
ax.set_ylabel('ROC AUC score',**axis_font)
ax.set_xlabel('num of trees',**axis_font)
In [145]:
roc_scores = []
tree_deps = [4, 6, 10, 15]
for dep in tree_deps:
    model = RandomForestRegressor(n_estimators=n_trees_best, max_depth=dep, oob_score=True, n_jobs=-1, random_state=42 );
    model.fit(X, y);
    score = roc_auc_score(y, model.oob_prediction_)
    roc_scores.append(score)
    print(' trees max depth: {0:s} -> roc score: {1:0.5f}'.format(str(dep),score))
#
tree_depth_best = tree_deps[ np.argmax(roc_scores) ]
print('\n tree max depth best option: {}'.format(tree_depth_best))
#ax = pd.Series(roc_scores, tree_deps).plot(kind='barh',fontsize=15, ylim=(0.7,0.9));
#ax.set_ylabel('ROC AUC score',**axis_font)
#ax.set_xlabel('num of trees',**axis_font)
In [146]:
roc_scores = []
max_features_type = ['auto', None, 'sqrt', 'log2', 0.9, 0.2]
for max_features in max_features_type:
    model = RandomForestRegressor(n_estimators = n_trees_best, oob_score=True, n_jobs=-1, max_depth=tree_depth_best,random_state=42,max_features=max_features )
    model.fit(X, y)
    score = roc_auc_score(y, model.oob_prediction_)
    roc_scores.append(score)
    print(' max features option: {0:s} -> roc score: {1:0.5f}'.format(str(max_features),score))
#
features_type_best = max_features_type[ np.argmax(roc_scores) ]
print('\n max_feature_type best option: {}'.format(features_type_best))
#ax = pd.Series(roc_scores, max_features_type).plot(kind="barh", fontsize=20, xlim=(.6,.9));
#ax.set_xlabel('ROC AUC score',**axis_font)
#ax.set_ylabel('max feature type',**axis_font)
In [147]:
roc_scores = []
min_samples_leaf_num = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
#
for min_samples in min_samples_leaf_num:
    model = RandomForestRegressor(n_estimators=n_trees_best,
                                  oob_score=True,
                                  n_jobs=-1,
                                  random_state=42,
                                  max_depth=tree_depth_best,
                                  max_features = features_type_best,
                                  min_samples_leaf=min_samples)
    model.fit(X, y)
    score = roc_auc_score(y, model.oob_prediction_)
    roc_scores.append(score)
    print(' num of min_samples_leaf: {0:2d} -> roc score: {1:0.5f}'.format(min_samples,score))
#
samples_leaf_best = min_samples_leaf_num[ np.argmax(roc_scores) ]
print('\n min_samples_leaf_num best option: {}'.format(samples_leaf_best))
#ax = pd.Series(roc_scores, min_samples_leaf_num).plot(fontsize=20, ylim=(0.7,0.9));
#ax.set_ylabel('ROC AUC score',**axis_font)
#ax.set_xlabel('min samples leaf',**axis_font)
# The hyper-parameter scan could also be performed with a grid search, e.g.:
from sklearn.grid_search import GridSearchCV
model = RandomForestRegressor(random_state=42, n_estimators=2000)
param_grid = { "max_features" : [3, 5], "max_depth" : [4, 10, 20], "min_samples_split" : [2, 4, 10] }
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=2)
grid_search.fit(X, y)
print(grid_search.best_params_)
In [ ]:
# restore full cell output
InteractiveShell.ast_node_interactivity = 'all'
In [ ]: