実践 レゴデータ分析[データ探索編]

List 5.6


In [1]:
import pandas as pd

List 5.8


In [2]:
# Location of the CSV file (brickset2016.csv) that this notebook analyses.
CSV_URL = 'https://raw.githubusercontent.com/pyfirst/samplecode/master/4_scraping/lego_scraper/brickset2016.csv'
# Load the CSV straight from the URL, letting pandas infer every column's dtype.
lego_df = pd.read_csv(CSV_URL)
# Preview the first five rows.
lego_df.head()


Out[2]:
number name image theme subtheme year rating pieces minifigs us_price eu_price owner want_it
0 10251.0 Brick Bank https://images.brickset.com/sets/small/10251-1... Advanced Models Modular Buildings 2016 4.6 2380.0 5.0 169.99 149.99 7695.0 4949.0
1 10252.0 Volkswagen Beetle https://images.brickset.com/sets/small/10252-1... Advanced Models Vehicles 2016 5.0 1167.0 NaN 99.99 89.99 4185.0 2502.0
2 10253.0 Big Ben https://images.brickset.com/sets/small/10253-1... Advanced Models Buildings 2016 5.0 4163.0 NaN 249.99 219.99 2323.0 2916.0
3 10254.0 Winter Holiday Train https://images.brickset.com/sets/small/10254-1... Advanced Models Winter Village 2016 4.6 734.0 5.0 99.99 89.99 4955.0 1900.0
4 10654.0 XL Creative Brick Box https://images.brickset.com/sets/small/10654-1... Classic Basic Set 2016 NaN 1600.0 NaN NaN 59.99 279.0 170.0

List 5.9


In [3]:
# Inspect the dtype pandas inferred for each column.
lego_df.dtypes


Out[3]:
number      float64
name         object
image        object
theme        object
subtheme     object
year          int64
rating      float64
pieces      float64
minifigs    float64
us_price    float64
eu_price    float64
owner       float64
want_it     float64
dtype: object

List 5.10


In [4]:
# Reload the CSV, this time forcing the `number` column to be read as object
# (string) instead of letting pandas infer a numeric type for it.
lego_df = pd.read_csv(CSV_URL, dtype={'number': 'object'})
# Preview the first five rows again to check how `number` is displayed now.
lego_df.head()


Out[4]:
number name image theme subtheme year rating pieces minifigs us_price eu_price owner want_it
0 10251 Brick Bank https://images.brickset.com/sets/small/10251-1... Advanced Models Modular Buildings 2016 4.6 2380.0 5.0 169.99 149.99 7695.0 4949.0
1 10252 Volkswagen Beetle https://images.brickset.com/sets/small/10252-1... Advanced Models Vehicles 2016 5.0 1167.0 NaN 99.99 89.99 4185.0 2502.0
2 10253 Big Ben https://images.brickset.com/sets/small/10253-1... Advanced Models Buildings 2016 5.0 4163.0 NaN 249.99 219.99 2323.0 2916.0
3 10254 Winter Holiday Train https://images.brickset.com/sets/small/10254-1... Advanced Models Winter Village 2016 4.6 734.0 5.0 99.99 89.99 4955.0 1900.0
4 10654 XL Creative Brick Box https://images.brickset.com/sets/small/10654-1... Classic Basic Set 2016 NaN 1600.0 NaN NaN 59.99 279.0 170.0

List 5.11


In [5]:
# Preview the last five rows.
lego_df.tail()


Out[5]:
number name image theme subtheme year rating pieces minifigs us_price eu_price owner want_it
823 NaN Cogsworth https://images.brickset.com/sets/small/TRUCOGS... Disney Beauty and the Beast 2016 4.0 47.0 NaN NaN NaN 174.0 151.0
824 NaN Millennium Falcon https://images.brickset.com/sets/small/TRUFALC... Star Wars Promotional 2016 4.0 44.0 NaN NaN NaN 443.0 274.0
825 NaN Lumiere https://images.brickset.com/sets/small/TRULUMI... Disney Beauty and the Beast 2016 5.0 23.0 NaN NaN NaN 191.0 151.0
826 NaN Nexo Knights Shield Dock https://images.brickset.com/sets/small/TRUNEXO... Nexo Knights Miscellaneous 2016 4.0 41.0 NaN NaN NaN 244.0 207.0
827 NaN Winner's Podium https://images.brickset.com/sets/small/TRUPODI... Miscellaneous NaN 2016 NaN NaN NaN NaN NaN 82.0 60.0

List 5.12


In [6]:
# Print the total number of rows so the per-column counts below can be compared to it.
print('全体の行数: ', len(lego_df))
# Count the missing values in each column.
lego_df.isnull().sum()


全体の行数:  828
Out[6]:
number      104
name          0
image        20
theme         0
subtheme    116
year          0
rating      509
pieces      256
minifigs    429
us_price    304
eu_price    356
owner        13
want_it      13
dtype: int64

List 5.13


In [7]:
# Columns that must hold a value for a row to be kept.
required_columns = ['number', 'us_price', 'pieces']

# Row count before filtering.
print('dropna 実行前: ', len(lego_df))

# Drop, in place, every row with a missing value in any required column.
lego_df.dropna(inplace=True, subset=required_columns)

# Row count after filtering.
print('dropna 実行後: ', len(lego_df))

# Re-count the missing values that remain in each column.
lego_df.isnull().sum()


dropna 実行前:  828
dropna 実行後:  438
Out[7]:
number        0
name          0
image         0
theme         0
subtheme     74
year          0
rating      177
pieces        0
minifigs    118
us_price      0
eu_price     63
owner         0
want_it       0
dtype: int64

List 5.14


In [8]:
# Make the set number the row index (in place). `number` stops being a regular
# column afterwards, so running this cell a second time raises KeyError.
lego_df.set_index('number', inplace=True)
# Show the first two rows to confirm the new index.
lego_df.head(2)


Out[8]:
name image theme subtheme year rating pieces minifigs us_price eu_price owner want_it
number
10251 Brick Bank https://images.brickset.com/sets/small/10251-1... Advanced Models Modular Buildings 2016 4.6 2380.0 5.0 169.99 149.99 7695.0 4949.0
10252 Volkswagen Beetle https://images.brickset.com/sets/small/10252-1... Advanced Models Vehicles 2016 5.0 1167.0 NaN 99.99 89.99 4185.0 2502.0

List 5.15


In [9]:
# Select a subset of columns by passing a list of column names, then preview it.
lego_df[['theme', 'name', 'us_price']].head()


Out[9]:
theme name us_price
number
10251 Advanced Models Brick Bank 169.99
10252 Advanced Models Volkswagen Beetle 99.99
10253 Advanced Models Big Ben 249.99
10254 Advanced Models Winter Holiday Train 99.99
10702 Classic Creative Building Set 29.99

List 5.17


In [10]:
# Positional selection: the first four rows and the first three columns.
lego_df.iloc[:4, :3]


Out[10]:
name image theme
number
10251 Brick Bank https://images.brickset.com/sets/small/10251-1... Advanced Models
10252 Volkswagen Beetle https://images.brickset.com/sets/small/10252-1... Advanced Models
10253 Big Ben https://images.brickset.com/sets/small/10253-1... Advanced Models
10254 Winter Holiday Train https://images.brickset.com/sets/small/10254-1... Advanced Models

List 5.18


In [11]:
# Label-based selection: rows by index label, columns by name. The row labels
# are strings because `number` was read with dtype object before becoming the index.
lego_df.loc[['10253', '10254'], ['year', 'us_price']]


Out[11]:
year us_price
number
10253 2016 249.99
10254 2016 99.99

List 5.19


In [12]:
# Boolean mask marking the sets whose us_price is above 300.
over_300 = lego_df['us_price'] > 300
# Keep only the masked rows and show their name and price.
lego_df.loc[over_300, ['name', 'us_price']]


Out[12]:
name us_price
number
71040 Disney Castle 349.99
75159 Death Star 499.99
75827 Firehouse Headquarters 349.99

List 5.21


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
lego_df.describe()


Out[13]:
year rating pieces minifigs us_price eu_price owner want_it
count 438.0 261.000000 438.000000 320.000000 438.000000 375.000000 438.000000 438.000000
mean 2016.0 4.343678 309.408676 2.778125 34.856438 36.534267 2025.689498 723.933790
std 0.0 0.678862 549.443492 2.630450 51.375083 49.493712 1579.850469 609.538204
min 2016.0 1.700000 3.000000 1.000000 3.490000 1.990000 26.000000 21.000000
25% 2016.0 4.000000 63.000000 1.000000 9.990000 9.990000 833.500000 333.250000
50% 2016.0 4.500000 114.000000 2.000000 19.990000 19.990000 1602.000000 555.500000
75% 2016.0 5.000000 344.500000 4.000000 39.990000 39.990000 2954.250000 941.250000
max 2016.0 5.000000 4634.000000 27.000000 499.990000 449.990000 8300.000000 4949.000000

List 5.22


In [14]:
# The same summary statistics, rounded to one decimal place for readability.
lego_df.describe().round(1)


Out[14]:
year rating pieces minifigs us_price eu_price owner want_it
count 438.0 261.0 438.0 320.0 438.0 375.0 438.0 438.0
mean 2016.0 4.3 309.4 2.8 34.9 36.5 2025.7 723.9
std 0.0 0.7 549.4 2.6 51.4 49.5 1579.9 609.5
min 2016.0 1.7 3.0 1.0 3.5 2.0 26.0 21.0
25% 2016.0 4.0 63.0 1.0 10.0 10.0 833.5 333.2
50% 2016.0 4.5 114.0 2.0 20.0 20.0 1602.0 555.5
75% 2016.0 5.0 344.5 4.0 40.0 40.0 2954.2 941.2
max 2016.0 5.0 4634.0 27.0 500.0 450.0 8300.0 4949.0

List 5.23


In [15]:
# Price per piece (us_price divided by pieces), rounded to four decimal places.
lego_df['unit_price'] = (lego_df['us_price'] / lego_df['pieces']).round(4)
# Show the new column next to the two columns it was derived from.
lego_df[['pieces', 'us_price', 'unit_price']].head(3)


Out[15]:
pieces us_price unit_price
number
10251 2380.0 169.99 0.0714
10252 1167.0 99.99 0.0857
10253 4163.0 249.99 0.0601

5.4 実践 レゴデータ分析[データ可視化、分析編]

List 5.24


In [16]:
# Number of sets per theme (counts the non-null `theme` values within each group).
count_theme = lego_df.groupby('theme')['theme'].count()
# Show the five themes with the most sets.
count_theme.sort_values(ascending=False).head()


Out[16]:
theme
Collectable Minifigures    50
Star Wars                  44
City                       38
Nexo Knights               30
Duplo                      29
Name: theme, dtype: int64

List 5.25


In [17]:
%matplotlib inline

List 5.26


In [18]:
# The ten themes with the most sets, largest first.
theme_ranking10 = count_theme.sort_values(ascending=False).head(10)
# Draw the ranking as a bar chart.
theme_ranking10.plot.bar(figsize=(10, 5))


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x111e0e400>

List 5.27


In [19]:
# Histogram of us_price using 20 bins, with grid lines enabled.
lego_df['us_price'].plot.hist(bins=20, grid=True, figsize=(8, 5))


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1151364a8>

List 5.28


In [20]:
# Scatter plot of piece count (x) against us_price (y).
lego_df.plot.scatter(x='pieces', y='us_price', grid=True, figsize=(5, 5))


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1151c2240>

List 5.29


In [21]:
# The code shown on the right is incorrect: lego_df.plot.scatter(x='pieces', y='owner', grid=True, figsize=(5, 5))
# Scatter plot of us_price (x) against owner (y).
lego_df.plot.scatter(x='us_price', y='owner', grid=True, figsize=(5, 5))


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1151aad30>

List 5.30


In [22]:
from pandas.plotting import scatter_matrix
# Pairwise plots for the four selected columns, returned as a 4x4 grid of axes.
scatter_matrix(lego_df[['pieces', 'us_price', 'owner', 'want_it']], figsize=(8, 8))


Out[22]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1153cb128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115523a58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1155599e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115594a58>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1155cea58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1155cea90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11563d0b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1156760b8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1156b10b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1156ea0b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115714a58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11574df28>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1157904a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1157c99e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1157f57b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11582d828>]], dtype=object)

List 5.31


In [23]:
# Pairwise correlation coefficients between the selected columns,
# rounded to four decimal places.
lego_df[['minifigs', 'pieces', 'us_price', 'owner', 'want_it']].corr().round(4)


Out[23]:
minifigs pieces us_price owner want_it
minifigs 1.0000 0.7002 0.7606 -0.1161 0.4841
pieces 0.7002 1.0000 0.9446 0.0027 0.6595
us_price 0.7606 0.9446 1.0000 -0.0651 0.6242
owner -0.1161 0.0027 -0.0651 1.0000 0.4951
want_it 0.4841 0.6595 0.6242 0.4951 1.0000

List 5.32


In [24]:
from scipy.stats import pearsonr, linregress
# Pearson correlation between piece count and us_price; the result holds the
# correlation coefficient followed by the p-value.
pearsonr(lego_df['pieces'], lego_df['us_price'])


Out[24]:
(0.94462114898306215, 4.1838940534637179e-213)

List 5.33


In [25]:
# Simple linear regression with pieces as x and us_price as y; the result
# exposes slope, intercept, rvalue, pvalue and stderr.
linregress(lego_df['pieces'], lego_df['us_price'])


Out[25]:
LinregressResult(slope=0.088325716470023505, intercept=7.5276953841688261, rvalue=0.94462114898306204, pvalue=4.183894053465568e-213, stderr=0.0014695207825678772)

List 5.34


In [26]:
# Fit us_price as a linear function of pieces, then unpack the five result fields.
regression = linregress(lego_df['pieces'], lego_df['us_price'])
slope, intercept, rvalue, pvalue, stderr = regression

# Estimate the price of a 500-piece set from the fitted line: Y = aX + b
pieces = 500
price = slope * pieces + intercept

# Display the estimate rounded to the nearest whole number.
round(price)


Out[26]:
52.0

List 5.35


In [27]:
# Fit the regression of us_price on pieces and keep its coefficients as
# notebook-level constants for get_price(). Only SLOPE and INTERCEPT are used
# afterwards; the remaining names exist to unpack the five-field result.
# (`stderr` was previously misspelled `stder`.)
SLOPE, INTERCEPT, rvalue, pvalue, stderr = linregress(
    lego_df['pieces'],
    lego_df['us_price'])

def get_price(pieces, slope=None, intercept=None):
    """Estimate a price from a piece count using a straight line.

    Args:
        pieces: Piece count to evaluate the line at.
        slope: Slope of the line. When omitted (None), the notebook-level
            constant SLOPE is used.
        intercept: Intercept of the line. When omitted (None), the
            notebook-level constant INTERCEPT is used.

    Returns:
        ``pieces * slope + intercept``.
    """
    # The globals are looked up only when a coefficient is not supplied, so the
    # function can also be used with explicit coefficients before any fit exists.
    if slope is None:
        slope = SLOPE
    if intercept is None:
        intercept = INTERCEPT
    return pieces * slope + intercept

List 5.36


In [28]:
from matplotlib import pyplot as plt

# One 8x8 figure holding a single axes, shared by the scatter plot and the line.
fig, ax = plt.subplots(figsize=(8, 8))

# Observed data: piece count against us_price, drawn in cyan with grid lines.
lego_df.plot.scatter(x='pieces', y='us_price', color='c', grid=True, ax=ax)

# The fitted line is straight, so its two end points are enough to draw it:
# from 0 pieces up to the largest piece count in the data.
max_piece = lego_df['pieces'].max()
line_x = [0, max_piece]
line_y = [get_price(piece_count) for piece_count in line_x]
ax.plot(line_x, line_y)

plt.show()



In [ ]: