最近开始学习pandas用来进行数据分析的入门,这里将一些东西总结为一个notebook以方便查看。

下载数据


In [ ]:
# TODO

In [ ]:
%matplotlib inline

In [ ]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import seaborn as sns # 引入seaborn让图表更现代

分析网页请求数据


In [ ]:
import json
# Data source: http://1usagov.measuredvoice.com/2013/
# Materialize the records eagerly: a lazy map() in Python 3 would only read
# from the file after the `with` block has closed it, raising
# "ValueError: I/O operation on closed file" when `records` is consumed.
with open('usagov_bitly_data2013-05-17-1368832207') as fp:
    records = [json.loads(line) for line in fp]

In [ ]:
data = DataFrame(records)  # work with the parsed records as a DataFrame

# Normalize the timezone column: NaN -> 'Missing', empty string -> 'Unknown'
clean_tz = data['tz'].fillna('Missing').replace('', 'Unknown')

# data[(data['tz'] == '') & (data['al'] == 'en')] # filter data

In [ ]:
# Horizontal bar chart of the 15 most common timezones (figsize is in inches)
top_timezones = clean_tz.value_counts()[:15]
top_timezones.plot(kind='barh', figsize=(12, 5))

In [ ]:
# First token of each user-agent string identifies the browser/client
agents = Series([ua.split(' ', 1)[0] for ua in data.a.dropna()])
top_agents = agents.value_counts(ascending=True)[-15:]
top_agents.plot(kind='barh', figsize=(12, 5), logx=True)  # log-scaled x axis

In [ ]:
# Split users into Windows vs non-Windows and compare per timezone
cframe = data[data.a.notnull()]  # keep only rows that have a user agent

operating_systems = np.where(cframe['a'].str.contains('Windows'),
                             'Windows', 'Not Windows')

# Count each (timezone, OS) combination
by_tz_os = cframe.groupby(['tz', operating_systems])
agg_counts = by_tz_os.size().unstack().fillna(0)

# Positions that would sort timezones by their total count
indexer = agg_counts.sum(axis=1).argsort()
indexer[:10]  # the first 10 positions
count_subset = agg_counts.take(indexer)[-10:]  # the 10 busiest timezones
count_subset.plot(kind='barh', stacked=True)

In [ ]:
# Plot proportions instead of raw counts
normalized = count_subset.div(count_subset.sum(axis=1), axis=0)
normalized.plot(kind='barh', stacked=True, figsize=(10, 5))

分析电影评分数据


In [ ]:
# Data source: http://grouplens.org/datasets/movielens/1m/
# All three MovieLens tables use '::' as the field separator.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

def _read_ml(path, cols):
    # engine='python' is required because '::' is a multi-char separator
    return pd.read_table(path, sep='::', header=None, names=cols, engine='python')

users = _read_ml('ml-1m/users.dat', unames)
ratings = _read_ml('ml-1m/ratings.dat', rnames)
movies = _read_ml('ml-1m/movies.dat', mnames)
# Merge into one flat table (joins happen on the shared id columns)
mldata = pd.merge(pd.merge(ratings, users), movies)

In [ ]:
# Mean rating for every movie, split by viewer gender
mean_ratings = mldata.pivot_table(values='rating', index='title',
                                  columns=['gender'], aggfunc='mean')

In [ ]:
# Ratings grouped by title
# Keep only movies that received at least 250 ratings
ratings_by_title = mldata.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
# .ix was removed in pandas 1.0; .loc is the label-based replacement
mean_ratings = mean_ratings.loc[active_titles]

In [ ]:
# The N movies best loved by female viewers
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings.head(10)

In [ ]:
# Movies with the biggest rating gap between genders
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
sort_by_diff = mean_ratings.dropna().sort_values(by='diff', ascending=False)
sort_by_diff.head(10)

In [ ]:
# Movies with the most rating disagreement, irrespective of gender
# (.ix was removed in pandas 1.0; use label-based .loc instead)
mldata.groupby('title')['rating'].std().loc[active_titles].sort_values(ascending=False)[:10]

分析新生儿姓名数据


In [ ]:
# Data source: https://www.ssa.gov/oact/babynames/limits.html
from glob import glob
import re

year_ptn = re.compile(r'\d+')
pieces = []
for path in glob('names/*.txt'):
    frame = pd.read_csv(path, names=['name', 'sex', 'births'])
    # The year is the only run of digits in the file name (e.g. yob1880.txt)
    frame['year'] = int(year_ptn.search(path).group())
    pieces.append(frame)

# ignore_index=True discards the per-file row numbers
names = pd.concat(pieces, ignore_index=True)

In [ ]:
# Total births per year, split by sex.
# Use the string 'sum' rather than the builtin sum: pandas dispatches named
# aggregations to its fast path, and passing builtins is deprecated in
# recent pandas versions.
total_births = names.pivot_table('births', index='year', columns=['sex'], aggfunc='sum')
total_births.plot(title='Total births by sex and year', figsize=(12, 5))

In [ ]:
# Yearly birth counts for one specific name
mike = names[names.name == 'Mike']
mike.pivot_table('births', index='year').plot()

In [ ]:
# Share of total births taken by each name within its group
def add_prop(group):
    """Add a 'prop' column: each row's fraction of the group's total births."""
    births = group.births.astype(float)
    total = births.sum()
    group['prop'] = births / total
    return group

# Attach per-group proportions (apply behaves like a per-group transform here)
by_year_sex = names.groupby(['year', 'sex'])
names = by_year_sex.apply(add_prop)
# Sanity check: the name proportions inside every group must sum to 1
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

In [ ]:
# The 1000 most frequent names in each (year, sex) combination
def get_top1000(group):
    """Return the 1000 rows of `group` with the highest birth counts."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked[:1000]

# Group by (year, sex) and keep only the 1000 most frequent names per group
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

In [ ]:
# Split by sex, then build a year x name table of birth counts.
# 'sum' (the named aggregation) instead of the builtin sum, which newer
# pandas deprecates as an aggfunc.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')

In [ ]:
# Birth curves for a handful of classic names
subset = total_births.loc[:, ['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 12), grid=False,
            title='Number of births per year')

In [ ]:
# Share of all newborns covered by the top-1000 names, per year and sex.
# 'sum' instead of the builtin sum (deprecated as an aggfunc in newer pandas).
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13),
           figsize=(12, 5),
           xticks=range(1880, 2020, 10))

In [ ]:
# Number of distinct names needed to cover half of all births
def get_quantile_count(group, q=0.5):
    """Count the top names whose cumulative proportion first reaches `q`."""
    ordered = group.sort_values(by='prop', ascending=False)
    return ordered.prop.cumsum().searchsorted(q) + 1

# Name diversity over time, one line per sex
diversity = (top1000.groupby(['year', 'sex'])
             .apply(get_quantile_count)
             .unstack('sex')
             .astype(int))
diversity.plot(title='Number of popular names in top 50%', figsize=(12, 5))

In [ ]:
# All distinct names that ever appeared in a yearly top-1000 list
top1000['name'].unique()

numpy


In [ ]:
# A 3x3 integer matrix with values 1..9
arr = np.arange(1, 10).reshape(3, 3)

In [ ]:
# Broadcast assignment through basic indexing
arr[0] = 1    # fill the whole first row with the scalar 1
arr[1:3] = 2  # fill a slice of rows with the scalar 2
arr

In [ ]:
# Element access with a (row, column) index pair
arr = np.arange(1, 10).reshape(3, 3)
arr[1, 0]  # 4
arr[2, 0]  # 7

In [ ]:
# Slicing: row slices, then mixed row/column slices
arr = np.arange(1, 10).reshape(3, 3)
arr[1:3]     # last two rows
arr[:2, 1:]  # first two rows, columns 1 onward
arr[:, :1]   # first column, kept two-dimensional

In [ ]:
# Boolean-mask indexing
names = np.array(['bob', 'jerry', 'mike', 'tom', 'jerry', 'bob'])