In [1]:
import pandas as pd
In [2]:
# Location of the Brickset 2016 LEGO set listing (CSV hosted on GitHub).
CSV_URL = 'https://raw.githubusercontent.com/pyfirst/samplecode/master/4_scraping/lego_scraper/brickset2016.csv'

# First load: let pandas infer every column's dtype, then preview the top rows.
lego_df = pd.read_csv(filepath_or_buffer=CSV_URL)
lego_df.head(5)
Out[2]:
In [3]:
# Inspect the dtype pandas inferred for each column of the freshly loaded frame.
lego_df.dtypes
Out[3]:
In [4]:
# Reload the CSV, this time forcing the 'number' column to be read as
# object (string) values instead of letting pandas infer its dtype.
lego_df = pd.read_csv(
    CSV_URL,
    dtype={'number': 'object'},
)
lego_df.head(5)
Out[4]:
In [5]:
# Preview the last rows of the frame.
lego_df.tail()
Out[5]:
In [6]:
# Report the total row count, then the number of missing values per column.
print('全体の行数: ', lego_df.shape[0])
lego_df.isna().sum()
Out[6]:
In [7]:
# Drop every row that lacks a set number, a US price, or a piece count,
# printing the row count before and after so the effect is visible.
print('dropna 実行前: ', lego_df.shape[0])
lego_df.dropna(
    subset=['number', 'us_price', 'pieces'],
    inplace=True,
)
print('dropna 実行後: ', lego_df.shape[0])

# Remaining missing values per column after the cleanup.
lego_df.isna().sum()
Out[7]:
In [8]:
# Use the set number as the row index so rows can be looked up by label.
# Guarded so the cell can be re-run safely: once 'number' has become the index
# it is no longer a column, and calling set_index('number') again raises KeyError.
if lego_df.index.name != 'number':
    lego_df.set_index('number', inplace=True)
lego_df.head(2)
Out[8]:
In [9]:
# Column selection: keep only theme, name and US price, and preview the top rows.
lego_df.loc[:, ['theme', 'name', 'us_price']].head(5)
Out[9]:
In [10]:
# Positional slicing: the first four rows and the first three columns.
lego_df.iloc[:4, :3]
Out[10]:
In [11]:
# Label-based lookup: two sets selected by their index label,
# restricted to the release year and US price columns.
lego_df.loc[
    ['10253', '10254'],
    ['year', 'us_price'],
]
Out[11]:
In [12]:
# Boolean filtering: name and price of every set whose US price exceeds 300.
# Row mask and column list are passed in a single .loc call.
lego_df.loc[lego_df['us_price'] > 300, ['name', 'us_price']]
Out[12]:
In [13]:
# Summary statistics for the numeric columns.
lego_df.describe()
Out[13]:
In [14]:
# Same summary statistics, rounded to one decimal place for readability.
lego_df.describe().round(decimals=1)
Out[14]:
In [15]:
# Derived column: price per piece, rounded to four decimal places.
lego_df['unit_price'] = (lego_df['us_price'] / lego_df['pieces']).round(4)

# Show the inputs next to the derived value for the first three sets.
lego_df[['pieces', 'us_price', 'unit_price']].head(3)
Out[15]:
In [16]:
# Number of sets in each theme (count of non-null 'theme' values per group).
count_theme = (
    lego_df
    .groupby('theme')['theme']
    .count()
)

# Themes with the most sets first; preview the top of the ranking.
count_theme.sort_values(ascending=False).head(5)
Out[16]:
In [17]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [18]:
# Ten themes with the largest number of sets, drawn as a bar chart.
theme_ranking10 = count_theme.sort_values(ascending=False).iloc[:10]
theme_ranking10.plot(kind='bar', figsize=(10, 5))
Out[18]:
In [19]:
# Distribution of US prices as a 20-bin histogram.
lego_df['us_price'].plot(kind='hist', bins=20, grid=True, figsize=(8, 5))
Out[19]:
In [20]:
# Scatter plot of piece count (x) against US price (y).
lego_df.plot(
    kind='scatter',
    x='pieces',
    y='us_price',
    grid=True,
    figsize=(5, 5),
)
Out[20]:
In [21]:
# The following code is incorrect: lego_df.plot.scatter(x='pieces', y='owner', grid=True, figsize=(5, 5))
# Correct version: US price (x) against the 'owner' column (y).
lego_df.plot(
    kind='scatter',
    x='us_price',
    y='owner',
    grid=True,
    figsize=(5, 5),
)
Out[21]:
In [22]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the four numeric columns of interest.
scatter_matrix(
    frame=lego_df[['pieces', 'us_price', 'owner', 'want_it']],
    figsize=(8, 8),
)
Out[22]:
In [23]:
# Pairwise Pearson correlation coefficients, rounded to four decimal places.
(
    lego_df[['minifigs', 'pieces', 'us_price', 'owner', 'want_it']]
    .corr(method='pearson')
    .round(decimals=4)
)
Out[23]:
In [24]:
from scipy.stats import pearsonr, linregress

# Pearson correlation between piece count and US price, computed with SciPy.
pearsonr(x=lego_df['pieces'], y=lego_df['us_price'])
Out[24]:
In [25]:
# Least-squares linear regression of US price (y) on piece count (x).
linregress(x=lego_df['pieces'], y=lego_df['us_price'])
Out[25]:
In [26]:
# Fit price against piece count and unpack the five regression results.
slope, intercept, rvalue, pvalue, stderr = linregress(
    lego_df['pieces'],
    lego_df['us_price'],
)

# Predict the price of a 500-piece set from the fitted line Y = aX + b,
# where a is the slope and b the intercept.
pieces = 500
price = intercept + slope * pieces
round(price)
Out[26]:
In [27]:
# Fit US price against piece count and keep the coefficients as module-level
# constants for get_price(). The last unpacked name is spelled `stderr`
# (it was the typo `stder`), matching the identical unpacking used earlier.
SLOPE, INTERCEPT, rvalue, pvalue, stderr = linregress(
    lego_df['pieces'],
    lego_df['us_price'],
)
def get_price(pieces, slope=None, intercept=None):
    """Estimate a set's price from its piece count using a fitted line.

    Evaluates ``pieces * slope + intercept``.

    Args:
        pieces: Number of pieces (a scalar, or anything that supports
            ``* slope + intercept``, such as a NumPy array or pandas Series).
        slope: Slope of the line. Defaults to the module-level ``SLOPE``.
        intercept: Intercept of the line. Defaults to the module-level
            ``INTERCEPT``.

    Returns:
        The predicted price, in the same units as the regression's y values.
    """
    # Fall back to the notebook-level regression coefficients only when the
    # caller did not supply explicit ones, so existing get_price(n) calls
    # behave exactly as before.
    if slope is None:
        slope = SLOPE
    if intercept is None:
        intercept = INTERCEPT
    return pieces * slope + intercept
In [28]:
from matplotlib import pyplot as plt

# One 8x8-inch figure with a single axes to hold both layers of the plot.
figure, ax1 = plt.subplots(figsize=(8, 8))

# Layer 1: the observed sets (piece count vs. US price) as cyan points.
lego_df.plot.scatter(ax=ax1, x='pieces', y='us_price', grid=True, color='c')

# Layer 2: the fitted regression line, drawn between its two end points
# (zero pieces and the largest piece count in the data).
max_piece = lego_df['pieces'].max()
x = [0, max_piece]
y = [get_price(piece_count) for piece_count in x]
ax1.plot(x, y)

plt.show()
In [ ]: