I recently started learning pandas for basic data analysis; this notebook collects some of those notes in one place for easy reference.
In [ ]:
# TODO
In [ ]:
%matplotlib inline
In [ ]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import seaborn as sns # import seaborn for nicer default plot styles
In [ ]:
import json
# Data source: http://1usagov.measuredvoice.com/2013/
with open('usagov_bitly_data2013-05-17-1368832207') as fp:
    # parse eagerly: a lazy map() would fail in Python 3 because the file is closed before it is consumed
    records = [json.loads(line) for line in fp]
In [ ]:
data = DataFrame(records) # work with the records as a DataFrame
clean_tz = data['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
# data[(data['tz'] == '') & (data['al'] == 'en')] # filter data
In [ ]:
clean_tz.value_counts()[:15].plot(kind='barh', figsize=(12, 5)) # figsize sets the figure size in inches
In [ ]:
agents = Series([x.split(' ', 1)[0] for x in data.a.dropna()])
agents.value_counts(ascending=True)[-15:].plot(kind='barh', figsize=(12, 5), logx=True) # logx=True uses a logarithmic x-axis
In [ ]:
# Ratio of Windows to non-Windows users
cframe = data[data.a.notnull()] # keep only rows that have a user-agent string
operating_systems = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
by_tz_os = cframe.groupby(['tz', operating_systems])
agg_counts = by_tz_os.size().unstack().fillna(0)
# Sort by total count: argsort returns the row positions in ascending order of the row sums
indexer = agg_counts.sum(1).argsort()
indexer[:10] # first 10 positions (the smallest totals)
count_subset = agg_counts.take(indexer)[-10:] # last 10 rows, i.e. the 10 largest time zones
count_subset.plot(kind='barh', stacked=True)
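The argsort/take pair above is a positional sort: a minimal sketch on a made-up frame (not the bit.ly data) showing how it keeps the rows with the largest totals.
In [ ]:
# Hypothetical counts, only to illustrate argsort + take
demo = DataFrame({'Windows': [5, 50, 1], 'Not Windows': [3, 20, 2]}, index=['tz_a', 'tz_b', 'tz_c'])
order = demo.sum(1).argsort() # row positions, from smallest to largest row total
demo.take(order)[-2:] # the 2 rows with the largest totals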
In [ ]:
# Plot proportions instead of raw counts
count_subset.div(count_subset.sum(1), axis=0).plot(kind='barh', stacked=True, figsize=(10, 5))
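For reference, div with axis=0 aligns the Series of row totals against the row index, so each row is divided by its own sum; a tiny sketch on made-up numbers.
In [ ]:
# Hypothetical counts; after the division each row sums to 1
demo = DataFrame({'Windows': [8, 1], 'Not Windows': [2, 3]})
demo.div(demo.sum(1), axis=0)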
In [ ]:
# Data source: http://grouplens.org/datasets/movielens/1m/
# User data
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ml-1m/users.dat', sep='::', header=None, names=unames, engine='python')
# Ratings data
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None, names=rnames, engine='python')
# Movie data
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None, names=mnames, engine='python')
# Merge everything into the final dataset
mldata = pd.merge(pd.merge(ratings, users), movies)
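With no `on=` argument, pd.merge joins on whatever columns the two frames share (user_id for ratings/users, then movie_id when the movies table is merged in). A minimal sketch with made-up frames:
In [ ]:
# Hypothetical two-row frames; merge picks the shared 'user_id' column as the join key
left = DataFrame({'user_id': [1, 2], 'rating': [5, 3]})
right = DataFrame({'user_id': [1, 2], 'gender': ['F', 'M']})
pd.merge(left, right) # inner join on 'user_id'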
In [ ]:
# 使用数据
mean_ratings = mldata.pivot_table('rating', index='title', columns=['gender'], aggfunc='mean') # mean rating of each movie, broken down by gender
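The pivot_table call above is essentially shorthand for a groupby followed by unstack; the following should give roughly the same table.
In [ ]:
# Equivalent spelled out with groupby + unstack
mldata.groupby(['title', 'gender'])['rating'].mean().unstack('gender')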
In [ ]:
# Number of ratings per title
# Keep only movies with at least 250 ratings
ratings_by_title = mldata.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
mean_ratings = mean_ratings.loc[active_titles] # .ix is deprecated; .loc does label-based selection
In [ ]:
# Movies rated highest by female viewers
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]
In [ ]:
# Movies with the largest gender gap (diff = male mean - female mean; the descending sort puts male-preferred titles first)
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sort_by_diff = mean_ratings.dropna().sort_values(by='diff', ascending=False)
sort_by_diff[:10]
In [ ]:
# Movies with the most disagreement overall, regardless of gender: highest rating standard deviation
mldata.groupby('title')['rating'].std().loc[active_titles].sort_values(ascending=False)[:10]
In [ ]:
# Data source: https://www.ssa.gov/oact/babynames/limits.html
from glob import glob
import re
year_ptn = re.compile(r'\d+')
pieces = []
for file_path in glob('names/*.txt'):
    names_year = pd.read_csv(file_path, names=['name', 'sex', 'births'])
    names_year['year'] = int(year_ptn.search(file_path).group())
    pieces.append(names_year)
names = pd.concat(pieces, ignore_index=True) # ignore_index=True drops the per-file row labels and builds a fresh 0..n-1 index
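A tiny illustration (on two hypothetical frames) of what ignore_index changes: without it the concatenated result keeps the duplicated 0, 1 labels; with it the rows are relabeled 0..3.
In [ ]:
# Hypothetical frames, only to show the effect of ignore_index
a = DataFrame({'x': [1, 2]})
b = DataFrame({'x': [3, 4]})
pd.concat([a, b]).index.tolist(), pd.concat([a, b], ignore_index=True).index.tolist() # ([0, 1, 0, 1], [0, 1, 2, 3])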
In [ ]:
total_births = names.pivot_table('births', index='year', columns=['sex'], aggfunc=sum)
total_births.plot(title='Total births by sex and year', figsize=(12, 5))
In [ ]:
# Total births over the years for a single name
names[names.name == 'Mike'].pivot_table('births', index='year', aggfunc='sum').plot()
In [ ]:
# Fraction of births within each year/sex group that each name accounts for
def add_prop(group):
    births = group.births.astype(float)
    group['prop'] = births/births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop) # apply runs the function once per group and stitches the results back together
# Sanity check: within every year/sex group the proportions sum to 1
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
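The same prop column could also be built without a custom function: groupby(...).transform('sum') broadcasts each group's total back onto its rows. A sketch, assuming the flat-indexed names frame (i.e. used instead of, not after, the apply call above):
In [ ]:
# Alternative sketch using transform instead of apply (an assumption, not from the original notebook)
names['prop'] = names['births'] / names.groupby(['year', 'sex'])['births'].transform('sum')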
In [ ]:
# Keep the 1000 most common names within each year/sex combination
def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
In [ ]:
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc=sum)
In [ ]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 12), grid=False, title='Number of births per year')
In [ ]:
# Share of all births accounted for by the top-1000 names, by year and sex
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
yticks=np.linspace(0, 1.2, 13),
figsize=(12, 5),
xticks=range(1880, 2020, 10))
In [ ]:
# How many names are needed to cover 50% of births in each year/sex group
def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex').astype(int)
diversity.plot(title='Number of popular names in top 50%', figsize=(12, 5))
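A tiny worked example (made-up proportions) of the cumsum + searchsorted step: with sorted shares 0.4, 0.3, 0.2, 0.1 the cumulative sums are 0.4, 0.7, 0.9, 1.0, searchsorted(0.5) returns position 1, and the +1 turns that into "2 names cover half the births".
In [ ]:
# Made-up proportions, only to show what get_quantile_count computes
props = Series([0.4, 0.3, 0.2, 0.1])
props.cumsum().searchsorted(0.5) + 1 # -> 2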
In [ ]:
top1000.name.unique()
In [ ]:
arr = np.array([[1,2,3], [4,5,6], [7,8,9]])
In [ ]:
# Bulk assignment through indexing: a scalar is broadcast to every selected element
arr[0] = 1
arr[1:3] = 2
arr # array([[1, 1, 1], [2, 2, 2], [2, 2, 2]])
In [ ]:
# Element access in a 2-D array: arr[row, col]
arr = np.array([[1,2,3], [4,5,6], [7,8,9]])
arr[1, 0] # 4
arr[2, 0] # 7
In [ ]:
# Slicing
arr = np.array([[1,2,3], [4,5,6], [7,8,9]])
arr[1:3] # rows 1-2: [[4, 5, 6], [7, 8, 9]]
arr[:2, 1:] # first two rows, columns 1 onward: [[2, 3], [5, 6]]
arr[:, :1] # all rows, first column, kept 2-D: [[1], [2], [3]]
In [ ]:
# Boolean indexing
names = np.array(['bob', 'jerry', 'mike', 'tom', 'jerry', 'bob'])