In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series,DataFrame
import seaborn as sns

In [2]:
input_data = pd.read_csv("winemag-data_first150k.csv")

In [32]:
input_data.head()


Out[32]:
Unnamed: 0 country description designation points price province region_1 region_2 variety winery
0 0 US This tremendous 100% varietal wine hails from ... Martha's Vineyard 96 235.0 California Napa Valley Napa Cabernet Sauvignon Heitz
1 1 Spain Ripe aromas of fig, blackberry and cassis are ... Carodorum Selección Especial Reserva 96 110.0 Northern Spain Toro NaN Tinta de Toro Bodega Carmen Rodríguez
2 2 US Mac Watson honors the memory of a wine once ma... Special Selected Late Harvest 96 90.0 California Knights Valley Sonoma Sauvignon Blanc Macauley
3 3 US This spent 20 months in 30% new French oak, an... Reserve 96 65.0 Oregon Willamette Valley Willamette Valley Pinot Noir Ponzi
4 4 France This is the top wine from La Bégude, named aft... La Brûlade 95 66.0 Provence Bandol NaN Provence red blend Domaine de la Bégude

数量 中位数 平均值 最大最小四分位数

类别数,各个类别的数量


In [33]:
input_data["points"].describe()


Out[33]:
count    150930.000000
mean         87.888418
std           3.222392
min          80.000000
25%          86.000000
50%          88.000000
75%          90.000000
max         100.000000
Name: points, dtype: float64

In [34]:
input_data["country"].unique()


Out[34]:
array(['US', 'Spain', 'France', 'Italy', 'New Zealand', 'Bulgaria',
       'Argentina', 'Australia', 'Portugal', 'Israel', 'South Africa',
       'Greece', 'Chile', 'Morocco', 'Romania', 'Germany', 'Canada',
       'Moldova', 'Hungary', 'Austria', 'Croatia', 'Slovenia', nan,
       'India', 'Turkey', 'Macedonia', 'Lebanon', 'Serbia', 'Uruguay',
       'Switzerland', 'Albania', 'Bosnia and Herzegovina', 'Brazil',
       'Cyprus', 'Lithuania', 'Japan', 'China', 'South Korea', 'Ukraine',
       'England', 'Mexico', 'Georgia', 'Montenegro', 'Luxembourg',
       'Slovakia', 'Czech Republic', 'Egypt', 'Tunisia', 'US-France'], dtype=object)

In [35]:
input_data["country"].value_counts()


Out[35]:
US                        62397
Italy                     23478
France                    21098
Spain                      8268
Chile                      5816
Argentina                  5631
Portugal                   5322
Australia                  4957
New Zealand                3320
Austria                    3057
Germany                    2452
South Africa               2258
Greece                      884
Israel                      630
Hungary                     231
Canada                      196
Romania                     139
Slovenia                     94
Uruguay                      92
Croatia                      89
Bulgaria                     77
Moldova                      71
Mexico                       63
Turkey                       52
Georgia                      43
Lebanon                      37
Cyprus                       31
Brazil                       25
Macedonia                    16
Serbia                       14
Morocco                      12
Luxembourg                    9
England                       9
India                         8
Lithuania                     8
Czech Republic                6
Ukraine                       5
Bosnia and Herzegovina        4
Switzerland                   4
South Korea                   4
Slovakia                      3
Egypt                         3
China                         3
Albania                       2
Japan                         2
Tunisia                       2
Montenegro                    2
US-France                     1
Name: country, dtype: int64

查重查空 填充mean,改成相对值


In [36]:
input_data["price"].describe()


Out[36]:
count    137235.000000
mean         33.131482
std          36.322536
min           4.000000
25%          16.000000
50%          24.000000
75%          40.000000
max        2300.000000
Name: price, dtype: float64

In [38]:
#去重
(input_data.duplicated()).value_counts()
input_data = input_data.drop_duplicates()
(input_data.duplicated()).value_counts()


Out[38]:
False    150930
dtype: int64

In [41]:
input_data.columns


Out[41]:
Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'variety', 'winery'],
      dtype='object')

In [39]:
# 去空
input_data["price"].isnull().sum()


Out[39]:
13695

In [83]:
#df.fillna(value=0)
mean = input_data["price"].mean()
input_data["price_nonull"] = input_data["price"].fillna(mean)
print ("price",input_data["price"].isnull().sum())
print ("price_nonull",input_data["price_nonull"].isnull().sum())


price 13695
price_nonull 0

map apply修改数据,构造新的数据列等


In [84]:
#map
print (mean)
input_data["price_map_new"] = input_data["price_nonull"].map(lambda p: p-mean )


33.13148249353299

In [56]:
mean = input_data["price_nonull"].mean()


Out[56]:
33.13148249353165

In [85]:
#apply
mean = input_data["price_nonull"].mean()
def remean_points(srs):
    srs["points_apply_new"] = srs.price_nonull - mean
    return srs["points_apply_new"]
input_data["points_apply_new"] = input_data.apply(remean_points, axis='columns')

In [86]:
input_data.head()


Out[86]:
Unnamed: 0 country description designation points price province region_1 region_2 variety winery price_nonull price_map_new points_apply_new
0 0 US This tremendous 100% varietal wine hails from ... Martha's Vineyard 96 235.0 California Napa Valley Napa Cabernet Sauvignon Heitz 235.0 201.868518 201.868518
1 1 Spain Ripe aromas of fig, blackberry and cassis are ... Carodorum Selección Especial Reserva 96 110.0 Northern Spain Toro NaN Tinta de Toro Bodega Carmen Rodríguez 110.0 76.868518 76.868518
2 2 US Mac Watson honors the memory of a wine once ma... Special Selected Late Harvest 96 90.0 California Knights Valley Sonoma Sauvignon Blanc Macauley 90.0 56.868518 56.868518
3 3 US This spent 20 months in 30% new French oak, an... Reserve 96 65.0 Oregon Willamette Valley Willamette Valley Pinot Noir Ponzi 65.0 31.868518 31.868518
4 4 France This is the top wine from La Bégude, named aft... La Brûlade 95 66.0 Provence Bandol NaN Provence red blend Domaine de la Bégude 66.0 32.868518 32.868518

map apply 效率比较低,以为是一个一个的循环操作,相对于直接的“+”“-”等运算符比较慢,但比这些灵活,效果一样,合并,创建新的数据列


In [61]:
#效果一样
input_data["price_opera"] = input_data["price"]- mean

In [62]:
input_data.head()


Out[62]:
Unnamed: 0 country description designation points price province region_1 region_2 variety winery price_nonull price_new price_opera
0 0 US This tremendous 100% varietal wine hails from ... Martha's Vineyard 96 235.0 California Napa Valley Napa Cabernet Sauvignon Heitz 235.0 201.868518 201.868518
1 1 Spain Ripe aromas of fig, blackberry and cassis are ... Carodorum Selección Especial Reserva 96 110.0 Northern Spain Toro NaN Tinta de Toro Bodega Carmen Rodríguez 110.0 76.868518 76.868518
2 2 US Mac Watson honors the memory of a wine once ma... Special Selected Late Harvest 96 90.0 California Knights Valley Sonoma Sauvignon Blanc Macauley 90.0 56.868518 56.868518
3 3 US This spent 20 months in 30% new French oak, an... Reserve 96 65.0 Oregon Willamette Valley Willamette Valley Pinot Noir Ponzi 65.0 31.868518 31.868518
4 4 France This is the top wine from La Bégude, named aft... La Brûlade 95 66.0 Provence Bandol NaN Provence red blend Domaine de la Bégude 66.0 32.868518 32.868518

In [4]:
#input_data["variety"].isnull().sum()
input_data["country"].isnull().sum()


Out[4]:
5

In [5]:
output = input_data.loc[input_data.country.notnull()]

In [6]:
output.country.isnull().sum()
#output.variety.isnull().sum()


Out[6]:
0

In [7]:
output["country_variety"] = output.country + "-" + output.variety


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [8]:
output.head()


Out[8]:
Unnamed: 0 country description designation points price province region_1 region_2 variety winery country_variety
0 0 US This tremendous 100% varietal wine hails from ... Martha's Vineyard 96 235.0 California Napa Valley Napa Cabernet Sauvignon Heitz US-Cabernet Sauvignon
1 1 Spain Ripe aromas of fig, blackberry and cassis are ... Carodorum Selección Especial Reserva 96 110.0 Northern Spain Toro NaN Tinta de Toro Bodega Carmen Rodríguez Spain-Tinta de Toro
2 2 US Mac Watson honors the memory of a wine once ma... Special Selected Late Harvest 96 90.0 California Knights Valley Sonoma Sauvignon Blanc Macauley US-Sauvignon Blanc
3 3 US This spent 20 months in 30% new French oak, an... Reserve 96 65.0 Oregon Willamette Valley Willamette Valley Pinot Noir Ponzi US-Pinot Noir
4 4 France This is the top wine from La Bégude, named aft... La Brûlade 95 66.0 Provence Bandol NaN Provence red blend Domaine de la Bégude France-Provence red blend

In [13]:
output["description"].str.contains("blackberry").value_counts()


Out[13]:
False    136407
True      14518
Name: description, dtype: int64

In [14]:
output["blackberry"] = output["description"].str.contains("blackberry")
output.head()


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Out[14]:
Unnamed: 0 country description designation points price province region_1 region_2 variety winery country_variety blackberry
0 0 US This tremendous 100% varietal wine hails from ... Martha's Vineyard 96 235.0 California Napa Valley Napa Cabernet Sauvignon Heitz US-Cabernet Sauvignon False
1 1 Spain Ripe aromas of fig, blackberry and cassis are ... Carodorum Selección Especial Reserva 96 110.0 Northern Spain Toro NaN Tinta de Toro Bodega Carmen Rodríguez Spain-Tinta de Toro True
2 2 US Mac Watson honors the memory of a wine once ma... Special Selected Late Harvest 96 90.0 California Knights Valley Sonoma Sauvignon Blanc Macauley US-Sauvignon Blanc False
3 3 US This spent 20 months in 30% new French oak, an... Reserve 96 65.0 Oregon Willamette Valley Willamette Valley Pinot Noir Ponzi US-Pinot Noir True
4 4 France This is the top wine from La Bégude, named aft... La Brûlade 95 66.0 Provence Bandol NaN Provence red blend Domaine de la Bégude France-Provence red blend False

In [ ]:
#从description中拿相同描述的词出现的频率


print (mean)
input_data["price_map_new"] = input_data["price_nonull"].map(lambda p: p-mean )

In [ ]: