In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series,DataFrame
import seaborn as sns



In [2]:

    
input_data = pd.read_csv("winemag-data_first150k.csv")



In [32]:

    
input_data.head()









    Out[32]:







  
    
      
      Unnamed: 0
      country
      description
      designation
      points
      price
      province
      region_1
      region_2
      variety
      winery
    
  
  
    
      0
      0
      US
      This tremendous 100% varietal wine hails from ...
      Martha's Vineyard
      96
      235.0
      California
      Napa Valley
      Napa
      Cabernet Sauvignon
      Heitz
    
    
      1
      1
      Spain
      Ripe aromas of fig, blackberry and cassis are ...
      Carodorum Selección Especial Reserva
      96
      110.0
      Northern Spain
      Toro
      NaN
      Tinta de Toro
      Bodega Carmen Rodríguez
    
    
      2
      2
      US
      Mac Watson honors the memory of a wine once ma...
      Special Selected Late Harvest
      96
      90.0
      California
      Knights Valley
      Sonoma
      Sauvignon Blanc
      Macauley
    
    
      3
      3
      US
      This spent 20 months in 30% new French oak, an...
      Reserve
      96
      65.0
      Oregon
      Willamette Valley
      Willamette Valley
      Pinot Noir
      Ponzi
    
    
      4
      4
      France
      This is the top wine from La Bégude, named aft...
      La Brûlade
      95
      66.0
      Provence
      Bandol
      NaN
      Provence red blend
      Domaine de la Bégude

数量中位数平均值最大最小四分位数

类别数，各个类别的数量



In [33]:

    
input_data["points"].describe()









    Out[33]:





count    150930.000000
mean         87.888418
std           3.222392
min          80.000000
25%          86.000000
50%          88.000000
75%          90.000000
max         100.000000
Name: points, dtype: float64



In [34]:

    
input_data["country"].unique()









    Out[34]:





array(['US', 'Spain', 'France', 'Italy', 'New Zealand', 'Bulgaria',
       'Argentina', 'Australia', 'Portugal', 'Israel', 'South Africa',
       'Greece', 'Chile', 'Morocco', 'Romania', 'Germany', 'Canada',
       'Moldova', 'Hungary', 'Austria', 'Croatia', 'Slovenia', nan,
       'India', 'Turkey', 'Macedonia', 'Lebanon', 'Serbia', 'Uruguay',
       'Switzerland', 'Albania', 'Bosnia and Herzegovina', 'Brazil',
       'Cyprus', 'Lithuania', 'Japan', 'China', 'South Korea', 'Ukraine',
       'England', 'Mexico', 'Georgia', 'Montenegro', 'Luxembourg',
       'Slovakia', 'Czech Republic', 'Egypt', 'Tunisia', 'US-France'], dtype=object)



In [35]:

    
input_data["country"].value_counts()









    Out[35]:





US                        62397
Italy                     23478
France                    21098
Spain                      8268
Chile                      5816
Argentina                  5631
Portugal                   5322
Australia                  4957
New Zealand                3320
Austria                    3057
Germany                    2452
South Africa               2258
Greece                      884
Israel                      630
Hungary                     231
Canada                      196
Romania                     139
Slovenia                     94
Uruguay                      92
Croatia                      89
Bulgaria                     77
Moldova                      71
Mexico                       63
Turkey                       52
Georgia                      43
Lebanon                      37
Cyprus                       31
Brazil                       25
Macedonia                    16
Serbia                       14
Morocco                      12
Luxembourg                    9
England                       9
India                         8
Lithuania                     8
Czech Republic                6
Ukraine                       5
Bosnia and Herzegovina        4
Switzerland                   4
South Korea                   4
Slovakia                      3
Egypt                         3
China                         3
Albania                       2
Japan                         2
Tunisia                       2
Montenegro                    2
US-France                     1
Name: country, dtype: int64

查重查空填充mean，改成相对值



In [36]:

    
input_data["price"].describe()









    Out[36]:





count    137235.000000
mean         33.131482
std          36.322536
min           4.000000
25%          16.000000
50%          24.000000
75%          40.000000
max        2300.000000
Name: price, dtype: float64



In [38]:

    
#去重
(input_data.duplicated()).value_counts()
input_data = input_data.drop_duplicates()
(input_data.duplicated()).value_counts()









    Out[38]:





False    150930
dtype: int64



In [41]:

    
input_data.columns









    Out[41]:





Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'variety', 'winery'],
      dtype='object')



In [39]:

    
# 去空
input_data["price"].isnull().sum()









    Out[39]:





13695



In [83]:

    
#df.fillna(value=0)
mean = input_data["price"].mean()
input_data["price_nonull"] = input_data["price"].fillna(mean)
print ("price",input_data["price"].isnull().sum())
print ("price_nonull",input_data["price_nonull"].isnull().sum())









    



price 13695
price_nonull 0

map apply修改数据,构造新的数据列等



In [84]:

    
#map
print (mean)
input_data["price_map_new"] = input_data["price_nonull"].map(lambda p: p-mean )









    



33.13148249353299



In [56]:

    
mean = input_data["price_nonull"].mean()









    Out[56]:





33.13148249353165



In [85]:

    
#apply
mean = input_data["price_nonull"].mean()
def remean_points(srs):
    srs["points_apply_new"] = srs.price_nonull - mean
    return srs["points_apply_new"]
input_data["points_apply_new"] = input_data.apply(remean_points, axis='columns')



In [86]:

    
input_data.head()









    Out[86]:







  
    
      
      Unnamed: 0
      country
      description
      designation
      points
      price
      province
      region_1
      region_2
      variety
      winery
      price_nonull
      price_map_new
      points_apply_new
    
  
  
    
      0
      0
      US
      This tremendous 100% varietal wine hails from ...
      Martha's Vineyard
      96
      235.0
      California
      Napa Valley
      Napa
      Cabernet Sauvignon
      Heitz
      235.0
      201.868518
      201.868518
    
    
      1
      1
      Spain
      Ripe aromas of fig, blackberry and cassis are ...
      Carodorum Selección Especial Reserva
      96
      110.0
      Northern Spain
      Toro
      NaN
      Tinta de Toro
      Bodega Carmen Rodríguez
      110.0
      76.868518
      76.868518
    
    
      2
      2
      US
      Mac Watson honors the memory of a wine once ma...
      Special Selected Late Harvest
      96
      90.0
      California
      Knights Valley
      Sonoma
      Sauvignon Blanc
      Macauley
      90.0
      56.868518
      56.868518
    
    
      3
      3
      US
      This spent 20 months in 30% new French oak, an...
      Reserve
      96
      65.0
      Oregon
      Willamette Valley
      Willamette Valley
      Pinot Noir
      Ponzi
      65.0
      31.868518
      31.868518
    
    
      4
      4
      France
      This is the top wine from La Bégude, named aft...
      La Brûlade
      95
      66.0
      Provence
      Bandol
      NaN
      Provence red blend
      Domaine de la Bégude
      66.0
      32.868518
      32.868518

map apply 效率比较低，以为是一个一个的循环操作，相对于直接的“+”“-”等运算符比较慢，但比这些灵活，效果一样，合并，创建新的数据列



In [61]:

    
#效果一样
input_data["price_opera"] = input_data["price"]- mean



In [62]:

    
input_data.head()









    Out[62]:







  
    
      
      Unnamed: 0
      country
      description
      designation
      points
      price
      province
      region_1
      region_2
      variety
      winery
      price_nonull
      price_new
      price_opera
    
  
  
    
      0
      0
      US
      This tremendous 100% varietal wine hails from ...
      Martha's Vineyard
      96
      235.0
      California
      Napa Valley
      Napa
      Cabernet Sauvignon
      Heitz
      235.0
      201.868518
      201.868518
    
    
      1
      1
      Spain
      Ripe aromas of fig, blackberry and cassis are ...
      Carodorum Selección Especial Reserva
      96
      110.0
      Northern Spain
      Toro
      NaN
      Tinta de Toro
      Bodega Carmen Rodríguez
      110.0
      76.868518
      76.868518
    
    
      2
      2
      US
      Mac Watson honors the memory of a wine once ma...
      Special Selected Late Harvest
      96
      90.0
      California
      Knights Valley
      Sonoma
      Sauvignon Blanc
      Macauley
      90.0
      56.868518
      56.868518
    
    
      3
      3
      US
      This spent 20 months in 30% new French oak, an...
      Reserve
      96
      65.0
      Oregon
      Willamette Valley
      Willamette Valley
      Pinot Noir
      Ponzi
      65.0
      31.868518
      31.868518
    
    
      4
      4
      France
      This is the top wine from La Bégude, named aft...
      La Brûlade
      95
      66.0
      Provence
      Bandol
      NaN
      Provence red blend
      Domaine de la Bégude
      66.0
      32.868518
      32.868518



In [4]:

    
#input_data["variety"].isnull().sum()
input_data["country"].isnull().sum()









    Out[4]:





5



In [5]:

    
output = input_data.loc[input_data.country.notnull()]



In [6]:

    
output.country.isnull().sum()
#output.variety.isnull().sum()









    Out[6]:





0



In [7]:

    
output["country_variety"] = output.country + "-" + output.variety









    



C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.



In [8]:

    
output.head()









    Out[8]:







  
    
      
      Unnamed: 0
      country
      description
      designation
      points
      price
      province
      region_1
      region_2
      variety
      winery
      country_variety
    
  
  
    
      0
      0
      US
      This tremendous 100% varietal wine hails from ...
      Martha's Vineyard
      96
      235.0
      California
      Napa Valley
      Napa
      Cabernet Sauvignon
      Heitz
      US-Cabernet Sauvignon
    
    
      1
      1
      Spain
      Ripe aromas of fig, blackberry and cassis are ...
      Carodorum Selección Especial Reserva
      96
      110.0
      Northern Spain
      Toro
      NaN
      Tinta de Toro
      Bodega Carmen Rodríguez
      Spain-Tinta de Toro
    
    
      2
      2
      US
      Mac Watson honors the memory of a wine once ma...
      Special Selected Late Harvest
      96
      90.0
      California
      Knights Valley
      Sonoma
      Sauvignon Blanc
      Macauley
      US-Sauvignon Blanc
    
    
      3
      3
      US
      This spent 20 months in 30% new French oak, an...
      Reserve
      96
      65.0
      Oregon
      Willamette Valley
      Willamette Valley
      Pinot Noir
      Ponzi
      US-Pinot Noir
    
    
      4
      4
      France
      This is the top wine from La Bégude, named aft...
      La Brûlade
      95
      66.0
      Provence
      Bandol
      NaN
      Provence red blend
      Domaine de la Bégude
      France-Provence red blend



In [13]:

    
output["description"].str.contains("blackberry").value_counts()









    Out[13]:





False    136407
True      14518
Name: description, dtype: int64



In [14]:

    
output["blackberry"] = output["description"].str.contains("blackberry")
output.head()









    



C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.






    Out[14]:







  
    
      
      Unnamed: 0
      country
      description
      designation
      points
      price
      province
      region_1
      region_2
      variety
      winery
      country_variety
      blackberry
    
  
  
    
      0
      0
      US
      This tremendous 100% varietal wine hails from ...
      Martha's Vineyard
      96
      235.0
      California
      Napa Valley
      Napa
      Cabernet Sauvignon
      Heitz
      US-Cabernet Sauvignon
      False
    
    
      1
      1
      Spain
      Ripe aromas of fig, blackberry and cassis are ...
      Carodorum Selección Especial Reserva
      96
      110.0
      Northern Spain
      Toro
      NaN
      Tinta de Toro
      Bodega Carmen Rodríguez
      Spain-Tinta de Toro
      True
    
    
      2
      2
      US
      Mac Watson honors the memory of a wine once ma...
      Special Selected Late Harvest
      96
      90.0
      California
      Knights Valley
      Sonoma
      Sauvignon Blanc
      Macauley
      US-Sauvignon Blanc
      False
    
    
      3
      3
      US
      This spent 20 months in 30% new French oak, an...
      Reserve
      96
      65.0
      Oregon
      Willamette Valley
      Willamette Valley
      Pinot Noir
      Ponzi
      US-Pinot Noir
      True
    
    
      4
      4
      France
      This is the top wine from La Bégude, named aft...
      La Brûlade
      95
      66.0
      Provence
      Bandol
      NaN
      Provence red blend
      Domaine de la Bégude
      France-Provence red blend
      False



In [ ]:

    
#从description中拿相同描述的词出现的频率


print (mean)
input_data["price_map_new"] = input_data["price_nonull"].map(lambda p: p-mean )



In [ ]:

	Unnamed: 0	country	description	designation	points	price	province	region_1	region_2	variety	winery
0	0	US	This tremendous 100% varietal wine hails from ...	Martha's Vineyard	96	235.0	California	Napa Valley	Napa	Cabernet Sauvignon	Heitz
1	1	Spain	Ripe aromas of fig, blackberry and cassis are ...	Carodorum Selección Especial Reserva	96	110.0	Northern Spain	Toro	NaN	Tinta de Toro	Bodega Carmen Rodríguez
2	2	US	Mac Watson honors the memory of a wine once ma...	Special Selected Late Harvest	96	90.0	California	Knights Valley	Sonoma	Sauvignon Blanc	Macauley
3	3	US	This spent 20 months in 30% new French oak, an...	Reserve	96	65.0	Oregon	Willamette Valley	Willamette Valley	Pinot Noir	Ponzi
4	4	France	This is the top wine from La Bégude, named aft...	La Brûlade	95	66.0	Provence	Bandol	NaN	Provence red blend	Domaine de la Bégude

数量 中位数 平均值 最大最小四分位数

类别数，各个类别的数量

查重查空 填充mean，改成相对值

map apply修改数据,构造新的数据列等

map apply 效率比较低，以为是一个一个的循环操作，相对于直接的“+”“-”等运算符比较慢，但比这些灵活，效果一样，合并，创建新的数据列

数量中位数平均值最大最小四分位数

查重查空填充mean，改成相对值