In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series,DataFrame
import seaborn as sns

In [2]:
# Wine reviews dataset. Source:
# https://www.kaggle.com/residentmario/bivariate-plotting-with-pandas/data
# index_col=0 makes the first CSV column the DataFrame index.
reviews = pd.read_csv("winemag-data_first150k.csv", index_col=0)
# Preview the first five rows.
reviews.head(5)


Out[2]:
country description designation points price province region_1 region_2 variety winery
0 US This tremendous 100% varietal wine hails from ... Martha's Vineyard 96 235.0 California Napa Valley Napa Cabernet Sauvignon Heitz
1 Spain Ripe aromas of fig, blackberry and cassis are ... Carodorum Selección Especial Reserva 96 110.0 Northern Spain Toro NaN Tinta de Toro Bodega Carmen Rodríguez
2 US Mac Watson honors the memory of a wine once ma... Special Selected Late Harvest 96 90.0 California Knights Valley Sonoma Sauvignon Blanc Macauley
3 US This spent 20 months in 30% new French oak, an... Reserve 96 65.0 Oregon Willamette Valley Willamette Valley Pinot Noir Ponzi
4 France This is the top wine from La Bégude, named aft... La Brûlade 95 66.0 Provence Bandol NaN Provence red blend Domaine de la Bégude

Bivariate plotting with pandas

df.plot.scatter() df.plot.hexbin() df.plot.bar(stacked=True) df.plot.line()


In [3]:
# Relationship between price and points (score), drawn from a random sample
# of 100 wines priced under 100.
# random_state pins the sample so the figure is identical on every
# Restart & Run All instead of changing with each execution.
reviews[reviews['price'] < 100].sample(100, random_state=0).plot.scatter(x='price', y='points')
plt.show()



In [4]:
# Scatter plot of points against price for every wine priced under 100
# (no sampling here, so all matching rows are drawn).
cheaper_than_100 = reviews['price'] < 100
reviews.loc[cheaper_than_100].plot(kind='scatter', x='price', y='points')
plt.show()



In [5]:
# Hexbin plot of points against price for wines priced under 100.
# gridsize=15 controls how coarse the hexagonal grid is.
price_below_100 = reviews['price'] < 100
reviews.loc[price_below_100].plot(kind='hexbin', x='price', y='points', gridsize=15)
plt.show()



In [39]:
# Bar chart of the mean price for each points score.
# 'price' is selected before aggregating: calling .mean() on the whole
# grouped frame raises TypeError on pandas >= 2.0 because of the text columns
# (country, description, ...); older versions silently dropped them.
# NOTE: despite its name, count_data holds averages, not counts.
count_data = reviews.groupby('points')[['price']].mean()
count_data.plot.bar()
plt.show()



In [17]:
# Show the column labels available in the reviews DataFrame.
reviews.columns


Out[17]:
Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'variety', 'winery'],
      dtype='object')

In [36]:
# Review counts for the ten wineries that appear most often,
# drawn as a line plot.
winery_counts = reviews['winery'].value_counts()
cate = winery_counts.iloc[:10]
cate.plot(kind='line')
plt.show()



In [27]:
# Table of the mean price for each points score.
# 'price' is selected before aggregating: calling .mean() on the whole
# grouped frame raises TypeError on pandas >= 2.0 because of the text columns;
# older versions silently dropped them, leaving only 'price'.
# NOTE: despite its name, count_data holds averages, not counts.
count_data = reviews.groupby('points')[['price']].mean()
count_data


Out[27]:
price
points
80 17.224236
81 17.665078
82 19.171879
83 18.013604
84 19.367885
85 20.055067
86 21.816528
87 24.588279
88 27.831296
89 32.538375
90 37.357817
91 43.755835
92 52.303609
93 66.425438
94 81.937812
95 108.927012
96 132.350970
97 178.000000
98 232.131579
99 289.525000
100 401.583333

In [42]:
# Load the Pokemon stats table; index_col=0 makes the first CSV column
# the DataFrame index.
pokemon = pd.read_csv("Pokemon.csv", index_col=0)
# Preview the first five rows.
pokemon.head(5)


Out[42]:
Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed Generation Legendary
#
1 Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
2 Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
3 Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
4 Charmander Fire NaN 309 39 52 43 60 50 65 1 False

In [43]:
# Mean Attack and Defense for each (Legendary, Generation) group, drawn as
# stacked bars.
# The two stat columns are selected before .mean(): aggregating the whole
# frame first would also try to average the text columns (Name, Type 1,
# Type 2), which raises TypeError on pandas >= 2.0.
pokemon_stats_legendary = pokemon.groupby(['Legendary', 'Generation'])[['Attack', 'Defense']].mean()
pokemon_stats_legendary.plot.bar(stacked=True)
plt.show()



In [44]:
# Mean of each of the six battle stats per Generation, one line per stat.
# The stat columns are selected before .mean(): aggregating the whole frame
# first would also try to average the text columns (Name, Type 1, Type 2),
# which raises TypeError on pandas >= 2.0.
pokemon_stats_by_generation = pokemon.groupby('Generation')[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].mean()
pokemon_stats_by_generation.plot.line()
plt.show()



In [ ]: