In [62]:
import pandas as pd
import numpy as np
%matplotlib inline
In [63]:
# A Series built from a dict: the keys become the index labels, the values the data.
labelled_values = {'row1': 'row1 col a', 'row 2': 'row2 col a'}
sq = pd.Series(labelled_values)
sq
Out[63]:
In [64]:
# The index holds the labels that were given as dict keys when sq was built.
sq.index
Out[64]:
In [72]:
# Dict of dicts: each outer key becomes a column, each inner key a row label.
column_a = {'row1': 'row1 col a', 'row 2': 'row2 col a'}
column_b = {'row1': 'row1 col b', 'row 2': 'row2 col b'}
df = pd.DataFrame({'column_a': column_a, 'column_b': column_b})
df
Out[72]:
In [73]:
# Row labels of the DataFrame.
df.index
Out[73]:
In [74]:
# Column labels of the DataFrame.
df.columns
Out[74]:
In [75]:
# Assigning a list to .columns relabels every column of the existing frame, in order.
# The list needs one name per column.
df.columns = ['new_column_a', 'new_column_b']
df
Out[75]:
In [76]:
# Attribute access with a column name selects that column: print its type, then display it.
print(type(df.new_column_a))
df.new_column_a
Out[76]:
In [9]:
# Type of a single selected column, shown as the cell's output value instead of being printed.
type(df.new_column_a)
Out[9]:
In [77]:
# .values exposes the column's underlying array: print its type, then display the array.
print(type(df.new_column_a.values))
df.new_column_a.values
Out[77]:
In [78]:
# Load the training data; this rebinds df, replacing the small example frame built earlier.
# 'train.csv' is a relative path, so it is resolved against the current working directory.
df = pd.read_csv('train.csv')
Let's use `.info()` to find the answer.
In [79]:
# Summary of the frame: index range, column dtypes and memory usage.
df.info()
RangeIndex: 10886 entries, 0 to 10885
=> there are 10886 rows (objects)
Data columns (total 12 columns):
=> there are 12 columns (features)
dtypes: float64(3), int64(8), object(1)
=> three dtypes (float, int, object)
memory usage: 1020.6+ KB
=> about 1 MB of memory is used
In [80]:
# Compute both facts first, then report them.
rows_and_columns = df.shape
has_missing = df.isnull().any().any()  # True if any cell in any column is null
print("count samples & features: ", rows_and_columns)
print("Are there missing values: ", has_missing)
In [87]:
# Peek at the first 10 rows.
df.head(10)
Out[87]:
In [88]:
# Distinct values that occur in the 'season' column.
df.season.unique()
# Bracket form of the same call, for column names that are not valid attribute names: df['season'].unique()
Out[88]:
In [92]:
# Number of distinct values in 'season'.
df.season.nunique()
Out[92]:
In [93]:
# Column names of the loaded data.
df.columns
Out[93]:
In [95]:
# Cardinality of every column: prints "<column name> <number of distinct values>" per line.
for name, values in df.items():
    print(name, values.nunique())
In [97]:
# Distinct values that occur in 'holiday'.
df.holiday.unique()
Out[97]:
In [96]:
# Double brackets select a one-column DataFrame, so .info() reports on 'holiday' alone.
df[ ['holiday'] ].info()
In [98]:
# Downcast 'holiday' to int8 (this overwrites the column in df), then re-run .info()
# on the one-column frame to see the effect on dtype and memory usage.
df['holiday'] = df['holiday'].astype('int8')
df[ ['holiday'] ].info()
In [102]:
def optimize_memory(df):
    """Return a copy of ``df`` with the bike-sharing columns cast to smaller dtypes.

    Casts applied:
      * 'holiday', 'weather', 'season', 'workingday' -> int8
      * 'temp', 'atemp', 'windspeed'                 -> float16
      * 'casual', 'registered', 'count'              -> int16

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all ten columns listed above. Any other columns are
        carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the casts applied. The frame passed in is left
        untouched, so the function can be re-run on the same input.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from ``df``.
    """
    target_dtypes = {
        'int8': ['holiday', 'weather', 'season', 'workingday'],
        # NOTE(review): float16 keeps only about three significant decimal digits,
        # so these values are rounded; confirm that is acceptable downstream.
        'float16': ['temp', 'atemp', 'windspeed'],
        'int16': ['casual', 'registered', 'count'],
    }
    dtype_by_column = {
        column: dtype
        for dtype, columns in target_dtypes.items()
        for column in columns
    }
    # A single astype call builds the result in one step instead of writing
    # ten columns back into the caller's frame one at a time.
    return df.astype(dtype_by_column)
# Rebind df to the downcast frame and inspect the resulting dtypes and memory usage.
df = optimize_memory(df)
df.info()
In [103]:
# Convert the 'datetime' column with pd.to_datetime; later cells use the .dt accessor on it.
df['datetime'] = pd.to_datetime(df['datetime'])
df.info()
In [105]:
# Reload from disk, this time letting read_csv parse 'datetime' while loading,
# then apply the same downcasting and inspect the result.
df = pd.read_csv('train.csv', parse_dates=['datetime'])
df = optimize_memory(df)
df.info()
https://www.kaggle.com/c/bike-sharing-demand/data
In [107]:
# First rows of the reloaded frame.
df.head()
Out[107]:
In [110]:
# Plot 'count' over the row index; the trailing ';' keeps the returned object out of the cell output.
df['count'].plot(figsize=(20, 10));
In [111]:
# Plot 'casual' over the row index, using the default figure size.
df['casual'].plot()
Out[111]:
In [113]:
# Plot 'registered' over the row index, using the default figure size.
df['registered'].plot()
Out[113]:
In [118]:
# Sanity check: does 'count' equal 'casual' + 'registered' on every row?
(df['count'] == df['casual'] + df['registered']).all()
Out[118]:
In [119]:
# Day of month for each timestamp, computed element by element with map.
# The vectorised accessor df.datetime.dt.day yields the same day numbers.
df.datetime.map(lambda x: x.day)
Out[119]:
In [120]:
# Hour of each timestamp via the vectorised .dt accessor.
df.datetime.dt.hour
Out[120]:
In [122]:
def plot_by_hour(data, year=None, agg='sum', ylim=(0, 80000)):
    """Bar chart of 'count' aggregated per hour of day, split by 'workingday'.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a datetime-typed 'datetime' column plus 'workingday' and 'count'.
        The frame is not modified.
    year : int, optional
        Keep only rows whose 'datetime' falls in this year. ``None`` (the
        default) uses every row.
    agg : str or callable, default 'sum'
        Aggregation applied to 'count' within each (hour, workingday) group.
    ylim : tuple or None, default (0, 80000)
        y-axis limits handed to the plot call. The default matches the scale
        this function always used, which suits ``agg='sum'``; pass ``None``
        to let the plot pick its own limits, e.g. for ``agg='mean'``.

    Returns
    -------
    Whatever ``DataFrame.plot`` returns for the aggregated table.
    """
    selected = data if year is None else data[data.datetime.dt.year == year]
    # assign() returns a new frame, so no 'hour' column is added to the caller's data.
    selected = selected.assign(hour=selected.datetime.dt.hour)
    # Rows: hour of day; columns: one per distinct 'workingday' value.
    by_hour = selected.groupby(['hour', 'workingday'])['count'].agg(agg).unstack()
    # Avoid the misleading "Year = None" title when no year filter is applied.
    title = "All years" if year is None else "Year = {0}".format(year)
    return by_hour.plot(kind='bar', ylim=ylim, figsize=(15, 5), width=0.9, title=title)
# One chart per year; the ';' after the last call keeps its return value out of the cell output.
plot_by_hour(df, year=2011)
plot_by_hour(df, year=2012);
In [123]:
def plot_by_year(data, agg_attr, title):
    """Bar chart of summed 'count' per ``agg_attr`` value, with one bar per year.

    Side effect: writes 'year', 'month' and 'hour' columns, derived from the
    'datetime' column, onto ``data`` itself. Later cells of this notebook read
    df['year'] and df['month'], so that mutation is part of the contract.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a datetime-typed 'datetime' column and a 'count' column.
    agg_attr : str
        Column to group by on the x-axis (the notebook uses 'month' and 'hour').
    title : str
        Title passed to the plot call.

    Returns
    -------
    Whatever ``DataFrame.plot`` returns for the aggregated table.
    """
    stamps = data.datetime.dt
    for part in ('year', 'month', 'hour'):
        data[part] = getattr(stamps, part)
    # Rows: agg_attr values; columns: one per year.
    totals = data.groupby([agg_attr, 'year'])['count'].sum().unstack()
    return totals.plot(kind='bar', figsize=(15, 5), width=0.9, title=title)
# Totals per month, then per hour, with one bar per year inside each group.
# Side effect: plot_by_year also writes 'year', 'month' and 'hour' columns onto df.
plot_by_year(df, 'month', "Rent bikes per month in 2011 and 2012")
plot_by_year(df, 'hour', "Rent bikes per hour in 2011 and 2012");
In [124]:
# Box plot of 'count' for each year. Needs the 'year' column that plot_by_year writes onto df.
df[ ['count', 'year'] ].boxplot(by="year", figsize=(15, 6));
In [125]:
# One monthly box plot of 'count' for each (year, workingday) combination.
for year in [2011, 2012]:
    for workingday in [0, 1]:
        # Both conditions must hold, hence '&'. With '|' the selection was the
        # union of "rows in this year" and "rows with this workingday value",
        # so every plot mixed both years and both day types.
        in_year = df.datetime.dt.year == year
        same_day_type = df.workingday == workingday
        dd = df[in_year & same_day_type]
        # Needs the 'month' column that plot_by_year writes onto df.
        dd[['count', 'month']].boxplot(by="month", figsize=(15, 6));
In [ ]:
In [126]:
# Names for the numeric weather codes.
weather = {1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Heavy Rain'}
# Look each code up by indexing the dict, so an unknown code raises KeyError.
df['weather_label'] = df['weather'].map(weather.__getitem__)
df['weather_label'].unique()
Out[126]:
In [37]:
# Row-wise apply (axis=1): the lambda receives one row at a time and formats a label
# from that row's 'weather' and 'season' values. Only the first five results are shown.
df[ ['weather', 'season'] ].apply(lambda x: 'weather-{0}, season-{1}'.format(x['weather'], x['season']), axis=1).head()
Out[37]:
In [127]:
# Use bracket assignment: it always writes a column. `df.year = ...` only updates a
# column when 'year' already exists; otherwise pandas sets a plain Python attribute
# on the object and the next line fails with KeyError.
df['year'] = df.datetime.dt.year
df['year'].value_counts()
Out[127]:
In [128]:
# Number of rows for each month value. Needs the 'month' column that plot_by_year writes onto df.
df['month'].value_counts()
Out[128]:
In [129]:
# Row counts per month within each year.
df.groupby('year')['month'].value_counts()
Out[129]:
In [131]:
# Smallest 'count' value in each year.
df.groupby('year')['count'].min()
Out[131]:
In [132]:
# Largest 'count' value in each year.
df.groupby('year')['count'].max()
Out[132]:
In [61]:
# Same result through the generic .agg() entry point. The aggregation is named by
# string: recent pandas releases warn when a NumPy callable such as np.max is passed.
df.groupby('year')['count'].agg('max')
Out[61]:
In [134]:
# Build the grouping once and reuse it for every statistic.
monthly_counts = df.groupby(['year', 'month'])['count']
# Aggregations are named by string. Recent pandas releases warn when NumPy callables
# are passed to agg, and on some NumPy versions np.min.__name__ / np.max.__name__
# are 'amin' / 'amax', which made the printed labels confusing.
for agg_name in ['mean', 'median', 'min', 'max']:
    print(agg_name, monthly_counts.agg(agg_name))
In [136]:
# Sort by year, then month, both descending, and show the first rows of the result.
df.sort_values(by=['year', 'month'], ascending=False).head()
Out[136]:
In [139]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv('df.csv', index=False)
In [140]:
# Shell escape: show the first lines of the file just written (needs a `head` command on the PATH).
!head df.csv
In [141]:
# Store the frame in an HDF5 file under the key 'df'. The key is passed by keyword:
# newer pandas releases deprecate positional arguments after the path.
df.to_hdf('df.h5', key='df')
In [ ]: