In [198]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#pd.set_option('display.mpl_style', 'default') #deprecated I believe
pd.options.display.mpl_style = 'default'
%matplotlib inline

In [14]:
#weather_2012_final = pd.read_csv('mtl-temps.csv', index_col='Date/Time')
#weather_2012_final.plot()

In [17]:
url_template = "http://climate.weather.gc.ca/climateData/bulkdata_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit=Download+Data"
url = url_template.format(month=3, year=2012)
weather_mar2012 = pd.read_csv('mtl-temps.csv', skiprows=16, index_col='Date/Time', parse_dates=True, encoding='latin1')

In [135]:
weather_mar2012.dropna(axis=1, how='any')[:3]


Out[135]:
Year Month Day Time
Date/Time
2015-03-01 00:00:00 2015 3 1 00:00
2015-03-01 01:00:00 2015 3 1 01:00
2015-03-01 02:00:00 2015 3 1 02:00

In [25]:
import pandas.io.data as web
import datetime

start = datetime.datetime(2013, 1, 1)
end = datetime.datetime(2015, 1, 27)
data = web.DataReader?

In [ ]:
data = web.DataReader

In [26]:
from pandas.io import wb

In [27]:
wb.search()

In [28]:
pwd


Out[28]:
u'/Users/me/Documents/Py/sci-py'

In [29]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)

In [36]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)
ratings.head()


Out[36]:
user_id movie_id rating unix_timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596

In [37]:
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5))
movies.head()


Out[37]:
movie_id title release_date video_release_date imdb_url
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2...
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(...
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%...
3 4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%...
4 5 Copycat (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995)

In [44]:
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)
movie_ratings.head()
lens.tail()


Out[44]:
movie_id title release_date video_release_date imdb_url user_id rating unix_timestamp age sex occupation zip_code
99995 748 Saint, The (1997) 14-Mar-1997 NaN http://us.imdb.com/M/title-exact?Saint%2C%20Th... 729 4 893286638 19 M student 56567
99996 751 Tomorrow Never Dies (1997) 01-Jan-1997 NaN http://us.imdb.com/M/title-exact?imdb-title-12... 729 3 893286338 19 M student 56567
99997 879 Peacemaker, The (1997) 01-Jan-1997 NaN http://us.imdb.com/M/title-exact?Peacemaker%2C... 729 3 893286299 19 M student 56567
99998 894 Home Alone 3 (1997) 01-Jan-1997 NaN http://us.imdb.com/M/title-exact?imdb-title-11... 729 1 893286511 19 M student 56567
99999 901 Mr. Magoo (1997) 25-Dec-1997 NaN http://us.imdb.com/M/title-exact?imdb-title-11... 729 1 893286491 19 M student 56567

In [41]:
most_rated = lens.groupby('title').size().order(ascending=False)[:10]
print most_rated


title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
dtype: int64

In [46]:
lens.title.value_counts()[:10]


Out[46]:
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
dtype: int64

In [136]:
movie_stats = lens.groupby('title').agg({'rating':[np.size, np.mean, np.median]})
movie_stats[:4]


Out[136]:
rating
size mean median
title
'Til There Was You (1997) 9 2.333333 2
1-900 (1994) 5 2.600000 3
101 Dalmatians (1996) 109 2.908257 3
12 Angry Men (1957) 125 4.344000 4

In [60]:
movie_stats.sort([('rating', 'mean')], ascending=False).head()


Out[60]:
rating
size mean median
title
They Made Me a Criminal (1939) 1 5 5
Marlene Dietrich: Shadow and Light (1996) 1 5 5
Saint of Fort Washington, The (1993) 2 5 5
Someone Else's America (1995) 1 5 5
Star Kid (1997) 3 5 5

In [62]:
movie_stats.sort([('rating', 'size')], ascending=False).head()
#note DF use .sort  Series uses .order


Out[62]:
rating
size mean median
title
Star Wars (1977) 583 4.358491 5
Contact (1997) 509 3.803536 4
Fargo (1996) 508 4.155512 4
Return of the Jedi (1983) 507 4.007890 4
Liar Liar (1997) 485 3.156701 3

In [80]:
atleast_100 = movie_stats['rating']['size'] >= 100
#print movie_stats[atleast_100].sort([('rating', 'mean')], ascending=False)[:15]
#atleast_100
#movie_stats['rating']['size']
#movie_stats['rating'].size
atleast_100
movie_stats[atleast_100].sort([('rating', 'mean')], ascending=False)[:15]


Out[80]:
rating
size mean median
title
Close Shave, A (1995) 112 4.491071 5
Schindler's List (1993) 298 4.466443 5
Wrong Trousers, The (1993) 118 4.466102 5
Casablanca (1942) 243 4.456790 5
Shawshank Redemption, The (1994) 283 4.445230 5
Rear Window (1954) 209 4.387560 5
Usual Suspects, The (1995) 267 4.385768 5
Star Wars (1977) 583 4.358491 5
12 Angry Men (1957) 125 4.344000 4
Citizen Kane (1941) 198 4.292929 5
To Kill a Mockingbird (1962) 219 4.292237 4
One Flew Over the Cuckoo's Nest (1975) 264 4.291667 4
Silence of the Lambs, The (1991) 390 4.289744 4
North by Northwest (1959) 179 4.284916 4
Godfather, The (1972) 413 4.283293 5

In [85]:
most_50 = lens.groupby('movie_id').size().order(ascending=False)[:5]
most_50
users.age


Out[85]:
movie_id
50          583
258         509
100         508
181         507
294         485
dtype: int64

In [138]:
users.age.hist(bins=30)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age');



In [94]:
users.age.hist()
users.age[:11]


Out[94]:
0     24
1     53
2     23
3     24
4     33
5     42
6     57
7     36
8     29
9     53
10    39
Name: age, dtype: int64

In [99]:
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
lens['age_group'] = pd.cut(lens.age, range(0, 81, 10), right=False, labels=labels) #exclusive of upper bounds in each bin
lens['age_group']
print lens[['age', 'age_group']].drop_duplicates()[:10]
#pd.cut() to bin numeric data


      age age_group
0      60     60-69
397    21     20-29
459    33     30-39
524    30     30-39
782    23     20-29
995    29     20-29
1229   26     20-29
1664   31     30-39
1942   24     20-29
2270   32     30-39

In [100]:
lens.set_index('movie_id', inplace=True)

In [103]:
by_age = lens.ix[most_50.index].groupby(['title', 'age_group'])
by_age.rating.mean().head(15)


Out[103]:
title             age_group
Contact (1997)    0-9          5.000000
                  10-19        3.693878
                  20-29        3.785714
                  30-39        3.847458
                  40-49        3.866667
                  50-59        3.739130
                  60-69        3.777778
Fargo (1996)      10-19        3.937500
                  20-29        4.010471
                  30-39        4.230769
                  40-49        4.294118
                  50-59        4.442308
                  60-69        4.000000
                  70-79        4.333333
Liar Liar (1997)  0-9          4.000000
Name: rating, dtype: float64

In [109]:
by_age.rating.mean().unstack(1).fillna(0)


Out[109]:
age_group 0-9 10-19 20-29 30-39 40-49 50-59 60-69 70-79
title
Contact (1997) 5 3.693878 3.785714 3.847458 3.866667 3.739130 3.777778 0.000000
Fargo (1996) 0 3.937500 4.010471 4.230769 4.294118 4.442308 4.000000 4.333333
Liar Liar (1997) 4 3.191489 3.321244 3.080645 2.916667 3.000000 3.100000 2.000000
Return of the Jedi (1983) 4 4.488372 4.067308 3.805970 4.000000 4.000000 3.333333 5.000000
Star Wars (1977) 3 4.630435 4.413043 4.305732 4.166667 4.469388 4.000000 4.000000

In [110]:
lens.reset_index('movie_id', inplace=True)

In [112]:
pivoted = lens.pivot_table(index=['movie_id', 'title'],
                           cols=['sex'],
                           values='rating',
                           fill_value=0)
print pivoted.head()


sex                                F         M
movie_id title                                
1        Toy Story (1995)   3.789916  3.909910
2        GoldenEye (1995)   3.368421  3.178571
3        Four Rooms (1995)  2.687500  3.108108
4        Get Shorty (1995)  3.400000  3.591463
5        Copycat (1995)     3.772727  3.140625

In [113]:
pivoted['diff'] = pivoted.M - pivoted.F
print pivoted.head()


sex                                F         M      diff
movie_id title                                          
1        Toy Story (1995)   3.789916  3.909910  0.119994
2        GoldenEye (1995)   3.368421  3.178571 -0.189850
3        Four Rooms (1995)  2.687500  3.108108  0.420608
4        Get Shorty (1995)  3.400000  3.591463  0.191463
5        Copycat (1995)     3.772727  3.140625 -0.632102

In [114]:
pivoted.reset_index('movie_id', inplace=True)

In [137]:
disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.order().plot(kind='barh', figsize=[9, 15])
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)')
plt.ylabel('Title')
plt.xlabel('Average Rating Difference');



In [118]:
SALES=pd.read_csv("sample-salesv2.csv")
SALES.head()


Out[118]:
account number name sku category quantity unit price ext price date
0 296809 Carroll PLC QN-82852 Belt 13 44.48 578.24 2014-09-27 07:13:03
1 98022 Heidenreich-Bosco MJ-21460 Shoes 19 53.62 1018.78 2014-07-29 02:10:44
2 563905 Kerluke, Reilly and Bechtelar AS-93055 Shirt 12 24.16 289.92 2014-03-01 10:51:24
3 93356 Waters-Walker AS-93055 Shirt 5 82.68 413.40 2013-11-17 20:41:11
4 659366 Waelchi-Fahey AS-93055 Shirt 18 99.64 1793.52 2014-01-03 08:14:27

In [125]:
report1 = SALES.pivot_table(values=['quantity'],index=['name'],columns=['category'], aggfunc=[np.sum])
report = SALES.pivot_table(values=['quantity'],index=['name'],columns=['category'], aggfunc=[np.sum, np.mean, np.median])
report1.head(n=10)


Out[125]:
sum
quantity
category Belt Shirt Shoes
name
Berge LLC 120 166 236
Carroll PLC 184 257 236
Cole-Eichmann 147 236 155
Davis, Kshlerin and Reilly 33 161 185
Ernser, Cruickshank and Lind 97 262 96
Gorczany-Hahn 103 237 100
Hamill-Hackett 32 148 239
Hegmann and Sons 88 278 240
Heidenreich-Bosco 106 92 233
Huel-Haag 43 200 115

In [139]:
report = SALES.pivot_table(values=['ext price','quantity'],index=['name'],columns=['category'], aggfunc=[np.sum])
report[:2]


Out[139]:
sum
ext price quantity
category Belt Shirt Shoes Belt Shirt Shoes
name
Berge LLC 6033.53 9670.24 14361.10 120 166 236
Carroll PLC 9359.26 13717.61 12857.44 184 257 236

In [140]:
sales=pd.read_csv("sample-salesv2.csv",parse_dates=['date'])
sales.head()


Out[140]:
account number name sku category quantity unit price ext price date
0 296809 Carroll PLC QN-82852 Belt 13 44.48 578.24 2014-09-27 07:13:03
1 98022 Heidenreich-Bosco MJ-21460 Shoes 19 53.62 1018.78 2014-07-29 02:10:44
2 563905 Kerluke, Reilly and Bechtelar AS-93055 Shirt 12 24.16 289.92 2014-03-01 10:51:24
3 93356 Waters-Walker AS-93055 Shirt 5 82.68 413.40 2013-11-17 20:41:11
4 659366 Waelchi-Fahey AS-93055 Shirt 18 99.64 1793.52 2014-01-03 08:14:27

In [146]:
#sales['ext price'] = sales['total price']
#sales.columns
#sales = sales.rename(['account number', 'name', 'sku', 'category', 'quantity', 'unit price', 'total price', 'date'])

In [147]:
customers = sales[['name','ext price','date']]
customers.head()


Out[147]:
name ext price date
0 Carroll PLC 578.24 2014-09-27 07:13:03
1 Heidenreich-Bosco 1018.78 2014-07-29 02:10:44
2 Kerluke, Reilly and Bechtelar 289.92 2014-03-01 10:51:24
3 Waters-Walker 413.40 2013-11-17 20:41:11
4 Waelchi-Fahey 1793.52 2014-01-03 08:14:27

In [153]:
customer_group = customers.groupby('name')
customer_group.size()[:6]


Out[153]:
name
Berge LLC                       52
Carroll PLC                     57
Cole-Eichmann                   51
Davis, Kshlerin and Reilly      41
Ernser, Cruickshank and Lind    47
Gorczany-Hahn                   42
dtype: int64

In [180]:
sales_totals5 = customer_group.sum().sort(columns='ext price').head()

sales_totals = customer_group.sum().sort(columns='ext price')
sales_totals[:2]


Out[180]:
ext price
name
Davis, Kshlerin and Reilly 19054.76
Huel-Haag 21087.88

In [163]:
my_plot = sales_totals.plot(kind='bar')



In [169]:
sales_totals5.plot(kind='barh')


Out[169]:
<matplotlib.axes._subplots.AxesSubplot at 0x119f92d10>

In [171]:
sales_totals5.plot(kind='pie', subplots=True)


Out[171]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x119d1c690>], dtype=object)

In [175]:
my_plot = sales_totals.sort(columns='ext price',ascending=True).plot(kind='barh', legend=None, title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")


Out[175]:
<matplotlib.text.Text at 0x11ad4bb90>

In [183]:
my_plot = sales_totals5.sort(columns='ext price',ascending=True).plot(kind='pie', legend=None, title="Total Sales by Customer", x='ext price', y='ext price')



In [184]:
customers = sales[['name','category','ext price','date']]
customers.head()


Out[184]:
name category ext price date
0 Carroll PLC Belt 578.24 2014-09-27 07:13:03
1 Heidenreich-Bosco Shoes 1018.78 2014-07-29 02:10:44
2 Kerluke, Reilly and Bechtelar Shirt 289.92 2014-03-01 10:51:24
3 Waters-Walker Shirt 413.40 2013-11-17 20:41:11
4 Waelchi-Fahey Shirt 1793.52 2014-01-03 08:14:27

In [185]:
category_group=customers.groupby(['name','category']).sum()
category_group.head()


Out[185]:
ext price
name category
Berge LLC Belt 6033.53
Shirt 9670.24
Shoes 14361.10
Carroll PLC Belt 9359.26
Shirt 13717.61

In [186]:
category_group.unstack().head()


Out[186]:
ext price
category Belt Shirt Shoes
name
Berge LLC 6033.53 9670.24 14361.10
Carroll PLC 9359.26 13717.61 12857.44
Cole-Eichmann 8112.70 14528.01 7794.71
Davis, Kshlerin and Reilly 1604.13 7533.03 9917.60
Ernser, Cruickshank and Lind 5894.38 16944.19 5250.45

In [187]:
my_plot = category_group.unstack().plot(kind='bar',stacked=True,title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales")


Out[187]:
<matplotlib.text.Text at 0x11cb5c810>

In [188]:
my_plot = category_group.unstack().plot(kind='bar',stacked=True,title="Total Sales by Customer",figsize=(9, 7))
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales")
my_plot.legend(["Total","Belts","Shirts","Shoes"], loc=9,ncol=4)


Out[188]:
<matplotlib.legend.Legend at 0x11f98ed10>

In [189]:
purchase_patterns = sales[['ext price','date']]
purchase_patterns.head()


Out[189]:
ext price date
0 578.24 2014-09-27 07:13:03
1 1018.78 2014-07-29 02:10:44
2 289.92 2014-03-01 10:51:24
3 413.40 2013-11-17 20:41:11
4 1793.52 2014-01-03 08:14:27

In [190]:
purchase_plot = purchase_patterns['ext price'].hist(bins=20)
purchase_plot.set_title("Purchase Patterns")
purchase_plot.set_xlabel("Order Amount($)")
purchase_plot.set_ylabel("Number of orders")


Out[190]:
<matplotlib.text.Text at 0x119689350>

In [191]:
purchase_patterns = sales[['ext price','date']]
purchase_patterns.head()


Out[191]:
ext price date
0 578.24 2014-09-27 07:13:03
1 1018.78 2014-07-29 02:10:44
2 289.92 2014-03-01 10:51:24
3 413.40 2013-11-17 20:41:11
4 1793.52 2014-01-03 08:14:27

In [192]:
purchase_patterns = purchase_patterns.set_index('date')
purchase_patterns.head()


Out[192]:
ext price
date
2014-09-27 07:13:03 578.24
2014-07-29 02:10:44 1018.78
2014-03-01 10:51:24 289.92
2013-11-17 20:41:11 413.40
2014-01-03 08:14:27 1793.52

In [197]:
purchase_patterns.resample('M',how=sum)  #wtf?
#purchase_patterns.resample?
#purchase_plot = purchase_patterns.resample('M',how=sum).plot(title="Total Sales by Month",legend=None)


Out[197]:
ext price
date
2013-10-31 14085.27
2013-11-30 44932.78
2013-12-31 57752.23
2014-01-31 53161.35
2014-02-28 51783.56
2014-03-31 44991.85
2014-04-30 34098.05
2014-05-31 50001.02
2014-06-30 37477.65
2014-07-31 41686.71
2014-08-31 55497.09
2014-09-30 57107.51
2014-10-31 37268.83

In [199]:
pwd


Out[199]:
u'/Users/me/Documents/Py/sci-py'

In [ ]: