notebook.community

Edit and run



In [198]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#pd.set_option('display.mpl_style', 'default') #deprecated I believe
pd.options.display.mpl_style = 'default'
%matplotlib inline



In [14]:

    
#weather_2012_final = pd.read_csv('mtl-temps.csv', index_col='Date/Time')
#weather_2012_final.plot()



In [17]:

    
url_template = "http://climate.weather.gc.ca/climateData/bulkdata_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit=Download+Data"
url = url_template.format(month=3, year=2012)
weather_mar2012 = pd.read_csv('mtl-temps.csv', skiprows=16, index_col='Date/Time', parse_dates=True, encoding='latin1')



In [135]:

    
weather_mar2012.dropna(axis=1, how='any')[:3]









    Out[135]:






  
    
      
      Year
      Month
      Day
      Time
    
    
      Date/Time
      
      
      
      
    
  
  
    
      2015-03-01 00:00:00
       2015
       3
       1
       00:00
    
    
      2015-03-01 01:00:00
       2015
       3
       1
       01:00
    
    
      2015-03-01 02:00:00
       2015
       3
       1
       02:00



In [25]:

    
import pandas.io.data as web
import datetime

start = datetime.datetime(2013, 1, 1)
end = datetime.datetime(2015, 1, 27)
data = web.DataReader?



In [ ]:

    
data = web.DataReader



In [26]:

    
from pandas.io import wb



In [27]:

    
wb.search()



In [28]:

    
pwd









    Out[28]:





u'/Users/me/Documents/Py/sci-py'



In [29]:

    
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)



In [36]:

    
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)
ratings.head()









    Out[36]:






  
    
      
      user_id
      movie_id
      rating
      unix_timestamp
    
  
  
    
      0
       196
       242
       3
       881250949
    
    
      1
       186
       302
       3
       891717742
    
    
      2
        22
       377
       1
       878887116
    
    
      3
       244
        51
       2
       880606923
    
    
      4
       166
       346
       1
       886397596



In [37]:

    
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5))
movies.head()









    Out[37]:






  
    
      
      movie_id
      title
      release_date
      video_release_date
      imdb_url
    
  
  
    
      0
       1
        Toy Story (1995)
       01-Jan-1995
      NaN
       http://us.imdb.com/M/title-exact?Toy%20Story%2...
    
    
      1
       2
        GoldenEye (1995)
       01-Jan-1995
      NaN
       http://us.imdb.com/M/title-exact?GoldenEye%20(...
    
    
      2
       3
       Four Rooms (1995)
       01-Jan-1995
      NaN
       http://us.imdb.com/M/title-exact?Four%20Rooms%...
    
    
      3
       4
       Get Shorty (1995)
       01-Jan-1995
      NaN
       http://us.imdb.com/M/title-exact?Get%20Shorty%...
    
    
      4
       5
          Copycat (1995)
       01-Jan-1995
      NaN
       http://us.imdb.com/M/title-exact?Copycat%20(1995)



In [44]:

    
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)
movie_ratings.head()
lens.tail()









    Out[44]:






  
    
      
      movie_id
      title
      release_date
      video_release_date
      imdb_url
      user_id
      rating
      unix_timestamp
      age
      sex
      occupation
      zip_code
    
  
  
    
      99995
       748
                Saint, The (1997)
       14-Mar-1997
      NaN
       http://us.imdb.com/M/title-exact?Saint%2C%20Th...
       729
       4
       893286638
       19
       M
       student
       56567
    
    
      99996
       751
       Tomorrow Never Dies (1997)
       01-Jan-1997
      NaN
       http://us.imdb.com/M/title-exact?imdb-title-12...
       729
       3
       893286338
       19
       M
       student
       56567
    
    
      99997
       879
           Peacemaker, The (1997)
       01-Jan-1997
      NaN
       http://us.imdb.com/M/title-exact?Peacemaker%2C...
       729
       3
       893286299
       19
       M
       student
       56567
    
    
      99998
       894
              Home Alone 3 (1997)
       01-Jan-1997
      NaN
       http://us.imdb.com/M/title-exact?imdb-title-11...
       729
       1
       893286511
       19
       M
       student
       56567
    
    
      99999
       901
                 Mr. Magoo (1997)
       25-Dec-1997
      NaN
       http://us.imdb.com/M/title-exact?imdb-title-11...
       729
       1
       893286491
       19
       M
       student
       56567



In [41]:

    
most_rated = lens.groupby('title').size().order(ascending=False)[:10]
print most_rated









    



title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
dtype: int64



In [46]:

    
lens.title.value_counts()[:10]









    Out[46]:





Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
dtype: int64



In [136]:

    
movie_stats = lens.groupby('title').agg({'rating':[np.size, np.mean, np.median]})
movie_stats[:4]









    Out[136]:






  
    
      
      rating
    
    
      
      size
      mean
      median
    
    
      title
      
      
      
    
  
  
    
      'Til There Was You (1997)
         9
       2.333333
       2
    
    
      1-900 (1994)
         5
       2.600000
       3
    
    
      101 Dalmatians (1996)
       109
       2.908257
       3
    
    
      12 Angry Men (1957)
       125
       4.344000
       4



In [60]:

    
movie_stats.sort([('rating', 'mean')], ascending=False).head()









    Out[60]:






  
    
      
      rating
    
    
      
      size
      mean
      median
    
    
      title
      
      
      
    
  
  
    
      They Made Me a Criminal (1939)
       1
       5
       5
    
    
      Marlene Dietrich: Shadow and Light (1996) 
       1
       5
       5
    
    
      Saint of Fort Washington, The (1993)
       2
       5
       5
    
    
      Someone Else's America (1995)
       1
       5
       5
    
    
      Star Kid (1997)
       3
       5
       5



In [62]:

    
movie_stats.sort([('rating', 'size')], ascending=False).head()
#note DF use .sort  Series uses .order









    Out[62]:






  
    
      
      rating
    
    
      
      size
      mean
      median
    
    
      title
      
      
      
    
  
  
    
      Star Wars (1977)
       583
       4.358491
       5
    
    
      Contact (1997)
       509
       3.803536
       4
    
    
      Fargo (1996)
       508
       4.155512
       4
    
    
      Return of the Jedi (1983)
       507
       4.007890
       4
    
    
      Liar Liar (1997)
       485
       3.156701
       3



In [80]:

    
atleast_100 = movie_stats['rating']['size'] >= 100
#print movie_stats[atleast_100].sort([('rating', 'mean')], ascending=False)[:15]
#atleast_100
#movie_stats['rating']['size']
#movie_stats['rating'].size
atleast_100
movie_stats[atleast_100].sort([('rating', 'mean')], ascending=False)[:15]









    Out[80]:






  
    
      
      rating
    
    
      
      size
      mean
      median
    
    
      title
      
      
      
    
  
  
    
      Close Shave, A (1995)
       112
       4.491071
       5
    
    
      Schindler's List (1993)
       298
       4.466443
       5
    
    
      Wrong Trousers, The (1993)
       118
       4.466102
       5
    
    
      Casablanca (1942)
       243
       4.456790
       5
    
    
      Shawshank Redemption, The (1994)
       283
       4.445230
       5
    
    
      Rear Window (1954)
       209
       4.387560
       5
    
    
      Usual Suspects, The (1995)
       267
       4.385768
       5
    
    
      Star Wars (1977)
       583
       4.358491
       5
    
    
      12 Angry Men (1957)
       125
       4.344000
       4
    
    
      Citizen Kane (1941)
       198
       4.292929
       5
    
    
      To Kill a Mockingbird (1962)
       219
       4.292237
       4
    
    
      One Flew Over the Cuckoo's Nest (1975)
       264
       4.291667
       4
    
    
      Silence of the Lambs, The (1991)
       390
       4.289744
       4
    
    
      North by Northwest (1959)
       179
       4.284916
       4
    
    
      Godfather, The (1972)
       413
       4.283293
       5



In [85]:

    
most_50 = lens.groupby('movie_id').size().order(ascending=False)[:5]
most_50
users.age









    Out[85]:





movie_id
50          583
258         509
100         508
181         507
294         485
dtype: int64



In [138]:

    
users.age.hist(bins=30)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age');



In [94]:

    
users.age.hist()
users.age[:11]









    Out[94]:





0     24
1     53
2     23
3     24
4     33
5     42
6     57
7     36
8     29
9     53
10    39
Name: age, dtype: int64



In [99]:

    
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
lens['age_group'] = pd.cut(lens.age, range(0, 81, 10), right=False, labels=labels) #exclusive of upper bounds in each bin
lens['age_group']
print lens[['age', 'age_group']].drop_duplicates()[:10]
#pd.cut() to bin numeric data









    



      age age_group
0      60     60-69
397    21     20-29
459    33     30-39
524    30     30-39
782    23     20-29
995    29     20-29
1229   26     20-29
1664   31     30-39
1942   24     20-29
2270   32     30-39



In [100]:

    
lens.set_index('movie_id', inplace=True)



In [103]:

    
by_age = lens.ix[most_50.index].groupby(['title', 'age_group'])
by_age.rating.mean().head(15)









    Out[103]:





title             age_group
Contact (1997)    0-9          5.000000
                  10-19        3.693878
                  20-29        3.785714
                  30-39        3.847458
                  40-49        3.866667
                  50-59        3.739130
                  60-69        3.777778
Fargo (1996)      10-19        3.937500
                  20-29        4.010471
                  30-39        4.230769
                  40-49        4.294118
                  50-59        4.442308
                  60-69        4.000000
                  70-79        4.333333
Liar Liar (1997)  0-9          4.000000
Name: rating, dtype: float64



In [109]:

    
by_age.rating.mean().unstack(1).fillna(0)









    Out[109]:






  
    
      age_group
      0-9
      10-19
      20-29
      30-39
      40-49
      50-59
      60-69
      70-79
    
    
      title
      
      
      
      
      
      
      
      
    
  
  
    
      Contact (1997)
       5
       3.693878
       3.785714
       3.847458
       3.866667
       3.739130
       3.777778
       0.000000
    
    
      Fargo (1996)
       0
       3.937500
       4.010471
       4.230769
       4.294118
       4.442308
       4.000000
       4.333333
    
    
      Liar Liar (1997)
       4
       3.191489
       3.321244
       3.080645
       2.916667
       3.000000
       3.100000
       2.000000
    
    
      Return of the Jedi (1983)
       4
       4.488372
       4.067308
       3.805970
       4.000000
       4.000000
       3.333333
       5.000000
    
    
      Star Wars (1977)
       3
       4.630435
       4.413043
       4.305732
       4.166667
       4.469388
       4.000000
       4.000000



In [110]:

    
lens.reset_index('movie_id', inplace=True)



In [112]:

    
pivoted = lens.pivot_table(index=['movie_id', 'title'],
                           cols=['sex'],
                           values='rating',
                           fill_value=0)
print pivoted.head()









    



sex                                F         M
movie_id title                                
1        Toy Story (1995)   3.789916  3.909910
2        GoldenEye (1995)   3.368421  3.178571
3        Four Rooms (1995)  2.687500  3.108108
4        Get Shorty (1995)  3.400000  3.591463
5        Copycat (1995)     3.772727  3.140625



In [113]:

    
pivoted['diff'] = pivoted.M - pivoted.F
print pivoted.head()









    



sex                                F         M      diff
movie_id title                                          
1        Toy Story (1995)   3.789916  3.909910  0.119994
2        GoldenEye (1995)   3.368421  3.178571 -0.189850
3        Four Rooms (1995)  2.687500  3.108108  0.420608
4        Get Shorty (1995)  3.400000  3.591463  0.191463
5        Copycat (1995)     3.772727  3.140625 -0.632102



In [114]:

    
pivoted.reset_index('movie_id', inplace=True)



In [137]:

    
disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.order().plot(kind='barh', figsize=[9, 15])
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)')
plt.ylabel('Title')
plt.xlabel('Average Rating Difference');



In [118]:

    
SALES=pd.read_csv("sample-salesv2.csv")
SALES.head()









    Out[118]:






  
    
      
      account number
      name
      sku
      category
      quantity
      unit price
      ext price
      date
    
  
  
    
      0
       296809
                         Carroll PLC
       QN-82852
        Belt
       13
       44.48
        578.24
       2014-09-27 07:13:03
    
    
      1
        98022
                   Heidenreich-Bosco
       MJ-21460
       Shoes
       19
       53.62
       1018.78
       2014-07-29 02:10:44
    
    
      2
       563905
       Kerluke, Reilly and Bechtelar
       AS-93055
       Shirt
       12
       24.16
        289.92
       2014-03-01 10:51:24
    
    
      3
        93356
                       Waters-Walker
       AS-93055
       Shirt
        5
       82.68
        413.40
       2013-11-17 20:41:11
    
    
      4
       659366
                       Waelchi-Fahey
       AS-93055
       Shirt
       18
       99.64
       1793.52
       2014-01-03 08:14:27



In [125]:

    
report1 = SALES.pivot_table(values=['quantity'],index=['name'],columns=['category'], aggfunc=[np.sum])
report = SALES.pivot_table(values=['quantity'],index=['name'],columns=['category'], aggfunc=[np.sum, np.mean, np.median])
report1.head(n=10)









    Out[125]:






  
    
      
      sum
    
    
      
      quantity
    
    
      category
      Belt
      Shirt
      Shoes
    
    
      name
      
      
      
    
  
  
    
      Berge LLC
       120
       166
       236
    
    
      Carroll PLC
       184
       257
       236
    
    
      Cole-Eichmann
       147
       236
       155
    
    
      Davis, Kshlerin and Reilly
        33
       161
       185
    
    
      Ernser, Cruickshank and Lind
        97
       262
        96
    
    
      Gorczany-Hahn
       103
       237
       100
    
    
      Hamill-Hackett
        32
       148
       239
    
    
      Hegmann and Sons
        88
       278
       240
    
    
      Heidenreich-Bosco
       106
        92
       233
    
    
      Huel-Haag
        43
       200
       115



In [139]:

    
report = SALES.pivot_table(values=['ext price','quantity'],index=['name'],columns=['category'], aggfunc=[np.sum])
report[:2]









    Out[139]:






  
    
      
      sum
    
    
      
      ext price
      quantity
    
    
      category
      Belt
      Shirt
      Shoes
      Belt
      Shirt
      Shoes
    
    
      name
      
      
      
      
      
      
    
  
  
    
      Berge LLC
       6033.53
        9670.24
       14361.10
       120
       166
       236
    
    
      Carroll PLC
       9359.26
       13717.61
       12857.44
       184
       257
       236



In [140]:

    
sales=pd.read_csv("sample-salesv2.csv",parse_dates=['date'])
sales.head()









    Out[140]:






  
    
      
      account number
      name
      sku
      category
      quantity
      unit price
      ext price
      date
    
  
  
    
      0
       296809
                         Carroll PLC
       QN-82852
        Belt
       13
       44.48
        578.24
      2014-09-27 07:13:03
    
    
      1
        98022
                   Heidenreich-Bosco
       MJ-21460
       Shoes
       19
       53.62
       1018.78
      2014-07-29 02:10:44
    
    
      2
       563905
       Kerluke, Reilly and Bechtelar
       AS-93055
       Shirt
       12
       24.16
        289.92
      2014-03-01 10:51:24
    
    
      3
        93356
                       Waters-Walker
       AS-93055
       Shirt
        5
       82.68
        413.40
      2013-11-17 20:41:11
    
    
      4
       659366
                       Waelchi-Fahey
       AS-93055
       Shirt
       18
       99.64
       1793.52
      2014-01-03 08:14:27



In [146]:

    
#sales['ext price'] = sales['total price']
#sales.columns
#sales = sales.rename(['account number', 'name', 'sku', 'category', 'quantity', 'unit price', 'total price', 'date'])



In [147]:

    
customers = sales[['name','ext price','date']]
customers.head()









    Out[147]:






  
    
      
      name
      ext price
      date
    
  
  
    
      0
                         Carroll PLC
        578.24
      2014-09-27 07:13:03
    
    
      1
                   Heidenreich-Bosco
       1018.78
      2014-07-29 02:10:44
    
    
      2
       Kerluke, Reilly and Bechtelar
        289.92
      2014-03-01 10:51:24
    
    
      3
                       Waters-Walker
        413.40
      2013-11-17 20:41:11
    
    
      4
                       Waelchi-Fahey
       1793.52
      2014-01-03 08:14:27



In [153]:

    
customer_group = customers.groupby('name')
customer_group.size()[:6]









    Out[153]:





name
Berge LLC                       52
Carroll PLC                     57
Cole-Eichmann                   51
Davis, Kshlerin and Reilly      41
Ernser, Cruickshank and Lind    47
Gorczany-Hahn                   42
dtype: int64



In [180]:

    
sales_totals5 = customer_group.sum().sort(columns='ext price').head()

sales_totals = customer_group.sum().sort(columns='ext price')
sales_totals[:2]









    Out[180]:






  
    
      
      ext price
    
    
      name
      
    
  
  
    
      Davis, Kshlerin and Reilly
       19054.76
    
    
      Huel-Haag
       21087.88



In [163]:

    
my_plot = sales_totals.plot(kind='bar')



In [169]:

    
sales_totals5.plot(kind='barh')









    Out[169]:





<matplotlib.axes._subplots.AxesSubplot at 0x119f92d10>



In [171]:

    
sales_totals5.plot(kind='pie', subplots=True)









    Out[171]:





array([<matplotlib.axes._subplots.AxesSubplot object at 0x119d1c690>], dtype=object)



In [175]:

    
my_plot = sales_totals.sort(columns='ext price',ascending=True).plot(kind='barh', legend=None, title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")









    Out[175]:





<matplotlib.text.Text at 0x11ad4bb90>



In [183]:

    
my_plot = sales_totals5.sort(columns='ext price',ascending=True).plot(kind='pie', legend=None, title="Total Sales by Customer", x='ext price', y='ext price')



In [184]:

    
customers = sales[['name','category','ext price','date']]
customers.head()









    Out[184]:






  
    
      
      name
      category
      ext price
      date
    
  
  
    
      0
                         Carroll PLC
        Belt
        578.24
      2014-09-27 07:13:03
    
    
      1
                   Heidenreich-Bosco
       Shoes
       1018.78
      2014-07-29 02:10:44
    
    
      2
       Kerluke, Reilly and Bechtelar
       Shirt
        289.92
      2014-03-01 10:51:24
    
    
      3
                       Waters-Walker
       Shirt
        413.40
      2013-11-17 20:41:11
    
    
      4
                       Waelchi-Fahey
       Shirt
       1793.52
      2014-01-03 08:14:27



In [185]:

    
category_group=customers.groupby(['name','category']).sum()
category_group.head()









    Out[185]:






  
    
      
      
      ext price
    
    
      name
      category
      
    
  
  
    
      Berge LLC
      Belt
        6033.53
    
    
      Shirt
        9670.24
    
    
      Shoes
       14361.10
    
    
      Carroll PLC
      Belt
        9359.26
    
    
      Shirt
       13717.61



In [186]:

    
category_group.unstack().head()









    Out[186]:






  
    
      
      ext price
    
    
      category
      Belt
      Shirt
      Shoes
    
    
      name
      
      
      
    
  
  
    
      Berge LLC
       6033.53
        9670.24
       14361.10
    
    
      Carroll PLC
       9359.26
       13717.61
       12857.44
    
    
      Cole-Eichmann
       8112.70
       14528.01
        7794.71
    
    
      Davis, Kshlerin and Reilly
       1604.13
        7533.03
        9917.60
    
    
      Ernser, Cruickshank and Lind
       5894.38
       16944.19
        5250.45



In [187]:

    
my_plot = category_group.unstack().plot(kind='bar',stacked=True,title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales")









    Out[187]:





<matplotlib.text.Text at 0x11cb5c810>



In [188]:

    
my_plot = category_group.unstack().plot(kind='bar',stacked=True,title="Total Sales by Customer",figsize=(9, 7))
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales")
my_plot.legend(["Total","Belts","Shirts","Shoes"], loc=9,ncol=4)









    Out[188]:





<matplotlib.legend.Legend at 0x11f98ed10>



In [189]:

    
purchase_patterns = sales[['ext price','date']]
purchase_patterns.head()









    Out[189]:






  
    
      
      ext price
      date
    
  
  
    
      0
        578.24
      2014-09-27 07:13:03
    
    
      1
       1018.78
      2014-07-29 02:10:44
    
    
      2
        289.92
      2014-03-01 10:51:24
    
    
      3
        413.40
      2013-11-17 20:41:11
    
    
      4
       1793.52
      2014-01-03 08:14:27



In [190]:

    
purchase_plot = purchase_patterns['ext price'].hist(bins=20)
purchase_plot.set_title("Purchase Patterns")
purchase_plot.set_xlabel("Order Amount($)")
purchase_plot.set_ylabel("Number of orders")









    Out[190]:





<matplotlib.text.Text at 0x119689350>



In [191]:

    
purchase_patterns = sales[['ext price','date']]
purchase_patterns.head()









    Out[191]:






  
    
      
      ext price
      date
    
  
  
    
      0
        578.24
      2014-09-27 07:13:03
    
    
      1
       1018.78
      2014-07-29 02:10:44
    
    
      2
        289.92
      2014-03-01 10:51:24
    
    
      3
        413.40
      2013-11-17 20:41:11
    
    
      4
       1793.52
      2014-01-03 08:14:27



In [192]:

    
purchase_patterns = purchase_patterns.set_index('date')
purchase_patterns.head()









    Out[192]:






  
    
      
      ext price
    
    
      date
      
    
  
  
    
      2014-09-27 07:13:03
        578.24
    
    
      2014-07-29 02:10:44
       1018.78
    
    
      2014-03-01 10:51:24
        289.92
    
    
      2013-11-17 20:41:11
        413.40
    
    
      2014-01-03 08:14:27
       1793.52



In [197]:

    
purchase_patterns.resample('M',how=sum)  #wtf?
#purchase_patterns.resample?
#purchase_plot = purchase_patterns.resample('M',how=sum).plot(title="Total Sales by Month",legend=None)



In [199]:

    
pwd









    Out[199]:





u'/Users/me/Documents/Py/sci-py'



In [ ]:

	ext price
date
2013-10-31	14085.27
2013-11-30	44932.78
2013-12-31	57752.23
2014-01-31	53161.35
2014-02-28	51783.56
2014-03-31	44991.85
2014-04-30	34098.05
2014-05-31	50001.02
2014-06-30	37477.65
2014-07-31	41686.71
2014-08-31	55497.09
2014-09-30	57107.51
2014-10-31	37268.83

	Year	Month	Day	Time
Date/Time
2015-03-01 00:00:00	2015	3	1	00:00
2015-03-01 01:00:00	2015	3	1	01:00
2015-03-01 02:00:00	2015	3	1	02:00

	user_id	movie_id	rating	unix_timestamp
0	196	242	3	881250949
1	186	302	3	891717742
2	22	377	1	878887116
3	244	51	2	880606923
4	166	346	1	886397596

	movie_id	title	release_date	video_release_date	imdb_url
0	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...
1	2	GoldenEye (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?GoldenEye%20(...
2	3	Four Rooms (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Four%20Rooms%...
3	4	Get Shorty (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Get%20Shorty%...
4	5	Copycat (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Copycat%20(1995)

	movie_id	title	release_date	video_release_date	imdb_url	user_id	rating	unix_timestamp	age	sex	occupation	zip_code
99995	748	Saint, The (1997)	14-Mar-1997	NaN	http://us.imdb.com/M/title-exact?Saint%2C%20Th...	729	4	893286638	19	M	student	56567
99996	751	Tomorrow Never Dies (1997)	01-Jan-1997	NaN	http://us.imdb.com/M/title-exact?imdb-title-12...	729	3	893286338	19	M	student	56567
99997	879	Peacemaker, The (1997)	01-Jan-1997	NaN	http://us.imdb.com/M/title-exact?Peacemaker%2C...	729	3	893286299	19	M	student	56567
99998	894	Home Alone 3 (1997)	01-Jan-1997	NaN	http://us.imdb.com/M/title-exact?imdb-title-11...	729	1	893286511	19	M	student	56567
99999	901	Mr. Magoo (1997)	25-Dec-1997	NaN	http://us.imdb.com/M/title-exact?imdb-title-11...	729	1	893286491	19	M	student	56567

	rating
	size	mean	median
title
'Til There Was You (1997)	9	2.333333	2
1-900 (1994)	5	2.600000	3
101 Dalmatians (1996)	109	2.908257	3
12 Angry Men (1957)	125	4.344000	4

age_group	0-9	10-19	20-29	30-39	40-49	50-59	60-69	70-79
title
Contact (1997)	5	3.693878	3.785714	3.847458	3.866667	3.739130	3.777778	0.000000
Fargo (1996)	0	3.937500	4.010471	4.230769	4.294118	4.442308	4.000000	4.333333
Liar Liar (1997)	4	3.191489	3.321244	3.080645	2.916667	3.000000	3.100000	2.000000
Return of the Jedi (1983)	4	4.488372	4.067308	3.805970	4.000000	4.000000	3.333333	5.000000
Star Wars (1977)	3	4.630435	4.413043	4.305732	4.166667	4.469388	4.000000	4.000000

	account number	name	sku	category	quantity	unit price	ext price	date
0	296809	Carroll PLC	QN-82852	Belt	13	44.48	578.24	2014-09-27 07:13:03
1	98022	Heidenreich-Bosco	MJ-21460	Shoes	19	53.62	1018.78	2014-07-29 02:10:44
2	563905	Kerluke, Reilly and Bechtelar	AS-93055	Shirt	12	24.16	289.92	2014-03-01 10:51:24
3	93356	Waters-Walker	AS-93055	Shirt	5	82.68	413.40	2013-11-17 20:41:11
4	659366	Waelchi-Fahey	AS-93055	Shirt	18	99.64	1793.52	2014-01-03 08:14:27

	sum
	quantity
category	Belt	Shirt	Shoes
name
Berge LLC	120	166	236
Carroll PLC	184	257	236
Cole-Eichmann	147	236	155
Davis, Kshlerin and Reilly	33	161	185
Ernser, Cruickshank and Lind	97	262	96
Gorczany-Hahn	103	237	100
Hamill-Hackett	32	148	239
Hegmann and Sons	88	278	240
Heidenreich-Bosco	106	92	233
Huel-Haag	43	200	115