In [198]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#pd.set_option('display.mpl_style', 'default') #deprecated I believe
pd.options.display.mpl_style = 'default'
%matplotlib inline
In [14]:
#weather_2012_final = pd.read_csv('mtl-temps.csv', index_col='Date/Time')
#weather_2012_final.plot()
In [17]:
url_template = "http://climate.weather.gc.ca/climateData/bulkdata_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit=Download+Data"
url = url_template.format(month=3, year=2012)
weather_mar2012 = pd.read_csv('mtl-temps.csv', skiprows=16, index_col='Date/Time', parse_dates=True, encoding='latin1')
In [135]:
weather_mar2012.dropna(axis=1, how='any')[:3]
Out[135]:
In [25]:
import pandas.io.data as web
import datetime
start = datetime.datetime(2013, 1, 1)
end = datetime.datetime(2015, 1, 27)
data = web.DataReader?
In [ ]:
data = web.DataReader
In [26]:
from pandas.io import wb
In [27]:
wb.search()
In [28]:
pwd
Out[28]:
In [29]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
In [36]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)
ratings.head()
Out[36]:
In [37]:
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5))
movies.head()
Out[37]:
In [44]:
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)
movie_ratings.head()
lens.tail()
Out[44]:
In [41]:
most_rated = lens.groupby('title').size().order(ascending=False)[:10]
print most_rated
In [46]:
lens.title.value_counts()[:10]
Out[46]:
In [136]:
movie_stats = lens.groupby('title').agg({'rating':[np.size, np.mean, np.median]})
movie_stats[:4]
Out[136]:
In [60]:
movie_stats.sort([('rating', 'mean')], ascending=False).head()
Out[60]:
In [62]:
movie_stats.sort([('rating', 'size')], ascending=False).head()
#note DF use .sort Series uses .order
Out[62]:
In [80]:
atleast_100 = movie_stats['rating']['size'] >= 100
#print movie_stats[atleast_100].sort([('rating', 'mean')], ascending=False)[:15]
#atleast_100
#movie_stats['rating']['size']
#movie_stats['rating'].size
atleast_100
movie_stats[atleast_100].sort([('rating', 'mean')], ascending=False)[:15]
Out[80]:
In [85]:
most_50 = lens.groupby('movie_id').size().order(ascending=False)[:5]
most_50
users.age
Out[85]:
In [138]:
users.age.hist(bins=30)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age');
In [94]:
users.age.hist()
users.age[:11]
Out[94]:
In [99]:
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
lens['age_group'] = pd.cut(lens.age, range(0, 81, 10), right=False, labels=labels) #exclusive of upper bounds in each bin
lens['age_group']
print lens[['age', 'age_group']].drop_duplicates()[:10]
#pd.cut() to bin numeric data
In [100]:
lens.set_index('movie_id', inplace=True)
In [103]:
by_age = lens.ix[most_50.index].groupby(['title', 'age_group'])
by_age.rating.mean().head(15)
Out[103]:
In [109]:
by_age.rating.mean().unstack(1).fillna(0)
Out[109]:
In [110]:
lens.reset_index('movie_id', inplace=True)
In [112]:
pivoted = lens.pivot_table(index=['movie_id', 'title'],
cols=['sex'],
values='rating',
fill_value=0)
print pivoted.head()
In [113]:
pivoted['diff'] = pivoted.M - pivoted.F
print pivoted.head()
In [114]:
pivoted.reset_index('movie_id', inplace=True)
In [137]:
disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.order().plot(kind='barh', figsize=[9, 15])
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)')
plt.ylabel('Title')
plt.xlabel('Average Rating Difference');
In [118]:
SALES=pd.read_csv("sample-salesv2.csv")
SALES.head()
Out[118]:
In [125]:
report1 = SALES.pivot_table(values=['quantity'],index=['name'],columns=['category'], aggfunc=[np.sum])
report = SALES.pivot_table(values=['quantity'],index=['name'],columns=['category'], aggfunc=[np.sum, np.mean, np.median])
report1.head(n=10)
Out[125]:
In [139]:
report = SALES.pivot_table(values=['ext price','quantity'],index=['name'],columns=['category'], aggfunc=[np.sum])
report[:2]
Out[139]:
In [140]:
sales=pd.read_csv("sample-salesv2.csv",parse_dates=['date'])
sales.head()
Out[140]:
In [146]:
#sales['ext price'] = sales['total price']
#sales.columns
#sales = sales.rename(['account number', 'name', 'sku', 'category', 'quantity', 'unit price', 'total price', 'date'])
In [147]:
customers = sales[['name','ext price','date']]
customers.head()
Out[147]:
In [153]:
customer_group = customers.groupby('name')
customer_group.size()[:6]
Out[153]:
In [180]:
sales_totals5 = customer_group.sum().sort(columns='ext price').head()
sales_totals = customer_group.sum().sort(columns='ext price')
sales_totals[:2]
Out[180]:
In [163]:
my_plot = sales_totals.plot(kind='bar')
In [169]:
sales_totals5.plot(kind='barh')
Out[169]:
In [171]:
sales_totals5.plot(kind='pie', subplots=True)
Out[171]:
In [175]:
my_plot = sales_totals.sort(columns='ext price',ascending=True).plot(kind='barh', legend=None, title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
Out[175]:
In [183]:
my_plot = sales_totals5.sort(columns='ext price',ascending=True).plot(kind='pie', legend=None, title="Total Sales by Customer", x='ext price', y='ext price')
In [184]:
customers = sales[['name','category','ext price','date']]
customers.head()
Out[184]:
In [185]:
category_group=customers.groupby(['name','category']).sum()
category_group.head()
Out[185]:
In [186]:
category_group.unstack().head()
Out[186]:
In [187]:
my_plot = category_group.unstack().plot(kind='bar',stacked=True,title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales")
Out[187]:
In [188]:
my_plot = category_group.unstack().plot(kind='bar',stacked=True,title="Total Sales by Customer",figsize=(9, 7))
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales")
my_plot.legend(["Total","Belts","Shirts","Shoes"], loc=9,ncol=4)
Out[188]:
In [189]:
purchase_patterns = sales[['ext price','date']]
purchase_patterns.head()
Out[189]:
In [190]:
purchase_plot = purchase_patterns['ext price'].hist(bins=20)
purchase_plot.set_title("Purchase Patterns")
purchase_plot.set_xlabel("Order Amount($)")
purchase_plot.set_ylabel("Number of orders")
Out[190]:
In [191]:
purchase_patterns = sales[['ext price','date']]
purchase_patterns.head()
Out[191]:
In [192]:
purchase_patterns = purchase_patterns.set_index('date')
purchase_patterns.head()
Out[192]:
In [197]:
purchase_patterns.resample('M',how=sum) #wtf?
#purchase_patterns.resample?
#purchase_plot = purchase_patterns.resample('M',how=sum).plot(title="Total Sales by Month",legend=None)
Out[197]:
In [199]:
pwd
Out[199]:
In [ ]: