In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series,DataFrame
import seaborn as sns
In [2]:
# Load the wine review CSV; the relative path is resolved against the
# notebook's working directory.
# NOTE(review): if the file carries its own row-number column, every row will
# look unique to duplicated()/drop_duplicates() later on - confirm whether
# index_col=0 is wanted here.
input_data = pd.read_csv("winemag-data_first150k.csv")
In [32]:
# Preview the first rows of the frame.
input_data.head()
Out[32]:
In [33]:
# Summary statistics for the "points" column.
input_data["points"].describe()
Out[33]:
In [34]:
# Distinct values present in the "country" column.
input_data["country"].unique()
Out[34]:
In [35]:
# Number of rows per country.
input_data["country"].value_counts()
Out[35]:
In [36]:
# Summary statistics for the "price" column.
input_data["price"].describe()
Out[36]:
In [38]:
# De-duplicate: show the duplicated-row tally before and after dropping.
# A notebook cell only renders its last expression, so the "before" tally is
# printed explicitly; as a bare expression in the middle of the cell it was
# computed and then silently discarded.
print(input_data.duplicated().value_counts())
input_data = input_data.drop_duplicates()
# Rendered as the cell output: the tally after de-duplication.
input_data.duplicated().value_counts()
Out[38]:
In [41]:
# List the column labels of the frame.
input_data.columns
Out[41]:
In [39]:
# Missing values: count the null entries in the "price" column.
input_data["price"].isnull().sum()
Out[39]:
In [83]:
# Fill missing prices with the column mean. The raw "price" column is left
# as-is and the filled values go to a new "price_nonull" column.
mean = input_data["price"].mean()
input_data["price_nonull"] = input_data["price"].fillna(value=mean)
# Report the null count of the raw column and of the filled column.
for column in ("price", "price_nonull"):
    print(column, input_data[column].isnull().sum())
In [84]:
# Series.map demo: shift every filled price by the mean, one value at a time.
print(mean)


def _subtract_mean(price):
    """Return ``price`` minus the notebook-level ``mean``."""
    return price - mean


input_data["price_map_new"] = input_data["price_nonull"].map(_subtract_mean)
In [56]:
# Rebind the notebook-level `mean` to the mean of the filled price column.
mean = input_data["price_nonull"].mean()
Out[56]:
In [85]:
# DataFrame.apply demo.
# `mean` is recomputed here, so this cell does not depend on the previous
# cell having been run.
mean = input_data["price_nonull"].mean()
def remean_points(srs, center=None):
    """Return a row's filled price shifted by a reference value.

    Intended as the callback for ``DataFrame.apply(..., axis='columns')``,
    which calls it once per row with the row as the only argument.

    Parameters
    ----------
    srs : mapping-like row (e.g. a pandas Series)
        Must provide a ``"price_nonull"`` entry.
    center : number, optional
        Value to subtract. When omitted, the notebook-level ``mean`` is used.

    Returns
    -------
    ``srs["price_nonull"] - center``
    """
    if center is None:
        # Resolved at call time: the cell that defines `mean` must have run.
        center = mean
    # The row itself is not modified; the caller assigns the returned values
    # to the target column.
    return srs["price_nonull"] - center
# Call remean_points once per row (axis='columns') and store the returned
# values in a new "points_apply_new" column.
input_data["points_apply_new"] = input_data.apply(remean_points, axis='columns')
In [86]:
# Preview the frame with the new column.
input_data.head()
Out[86]:
In [61]:
# Same result as the map/apply cells, written as plain column arithmetic.
# The filled column is used on purpose: subtracting from the raw "price"
# column would leave NaN on every row with a missing price, so the result
# would not match price_map_new / points_apply_new.
input_data["price_opera"] = input_data["price_nonull"] - mean
In [62]:
# Preview the frame with the new column.
input_data.head()
Out[62]:
In [4]:
# Count rows with a missing country (the equivalent check for "variety" is
# kept below, commented out).
#input_data["variety"].isnull().sum()
input_data["country"].isnull().sum()
Out[4]:
In [5]:
# Keep only the rows with a known country. The explicit .copy() makes `output`
# an independent frame, so the columns added to it in later cells are not
# written to a slice of input_data (which raises SettingWithCopyWarning).
output = input_data.loc[input_data.country.notnull()].copy()
In [6]:
output.country.isnull().sum()
#output.variety.isnull().sum()
Out[6]:
In [7]:
output["country_variety"] = output.country + "-" + output.variety
In [8]:
output.head()
Out[8]:
In [13]:
# Tally how many descriptions do / do not contain the text "blackberry".
output["description"].str.contains("blackberry").value_counts()
Out[13]:
In [14]:
# Store the same per-row match result as a new "blackberry" column.
output["blackberry"] = output["description"].str.contains("blackberry")
# Preview the frame with the new column.
output.head()
Out[14]:
In [ ]:
# Frequency of each word used across the review descriptions.
description_words = (
    input_data["description"]
    .dropna()                    # rows without a description contribute no words
    .str.lower()                 # count "Cherry" and "cherry" together
    .str.findall(r"[^\W\d_]+")   # runs of letters only (accented letters included)
)
# Flatten the per-row word lists into one Series and count the occurrences.
word_counts = pd.Series(
    [word for words in description_words for word in words]
).value_counts()
# Show the most frequent words.
word_counts.head(20)
In [ ]: