"Data is messy"
We will perform the following operations on our onion price data to refine it:
Other things you may need to do to refine the data are...
In [1]:
# Import the two libraries we need: pandas and NumPy
import pandas as pd
import numpy as np
In [2]:
# Load the scraped month-wise market arrivals data from its CSV file.
raw_csv_path = 'MonthWiseMarketArrivals.csv'
df = pd.read_csv(raw_csv_path)
In [3]:
# Preview the first five rows of the freshly loaded data.
df.head()
Out[3]:
In [4]:
# Preview the last five rows; later cells inspect and drop the final row.
df.tail()
Out[4]:
In [5]:
# List the dtype pandas inferred for each column when reading the CSV.
df.dtypes
Out[5]:
In [6]:
# Look at the last row of the dataframe (it is deleted in the next cell)
df.tail(1)
Out[6]:
In [7]:
# Delete the last row from the dataframe.
# Rebind `df` instead of using `inplace=True`: the result is the same frame
# minus its final row, but the statement reads as a plain transformation.
# NOTE: running this cell again drops one more row each time.
df = df.drop(index=df.index[-1:])
In [8]:
# Preview the first rows again after dropping the last row.
df.head()
Out[8]:
In [56]:
# Check the end of the frame: the former last row should no longer appear.
df.tail()
Out[56]:
In [57]:
# Re-check the column dtypes after dropping the last row.
df.dtypes
Out[57]:
In [58]:
# Peek at the columns at positions 4-6, which are among those cast to int next.
df.iloc[:,4:7].head()
Out[58]:
In [59]:
# Cast the columns at positions 2-6 to integers.
# Assigning through `df.iloc[:, 2:7] = ...` writes the values into the existing
# columns and, on pandas >= 2.0, keeps their previous dtype, so the cast can
# silently have no effect. Assigning by column label replaces the columns and
# the integer dtype sticks.
# NOTE(review): assumes the column labels are unique — confirm.
numeric_cols = df.columns[2:7]
df[numeric_cols] = df[numeric_cols].astype(int)
In [60]:
# Verify the column dtypes after the integer cast.
df.dtypes
Out[60]:
In [61]:
# Preview the frame after the integer cast.
df.head()
Out[61]:
In [62]:
# Summary statistics of the frame's columns.
df.describe()
Out[62]:
In [63]:
# Show the five most frequent market names; the next cells split them on '('.
df.market.value_counts().head()
Out[63]:
In [64]:
# Take whatever follows the last '(' in the market name as the raw state;
# names without a '(' are carried over whole.
market_pieces = df['market'].str.split('(')
df['state'] = market_pieces.str.get(-1)
In [65]:
# Preview the frame with the new `state` column.
df.head()
Out[65]:
In [66]:
# Take the part of the market name before the first '(' as the city.
market_pieces = df['market'].str.split('(')
df['city'] = market_pieces.str.get(0)
In [67]:
# Preview the frame with the new `city` column.
df.head()
Out[67]:
In [68]:
# Distinct raw `state` values, before they are split on ')' in the next cell.
df.state.unique()
Out[68]:
In [69]:
# Keep only the text before the first ')' in each state value.
state_pieces = df['state'].str.split(')')
df['state'] = state_pieces.str.get(0)
In [70]:
# Distinct `state` values after splitting on ')'.
df.state.unique()
Out[70]:
In [71]:
# Count the non-null entries per (state, market) pair, keeping the two keys
# as ordinary columns instead of moving them into the index.
state_market_keys = ['state', 'market']
dfState = df.groupby(by=state_market_keys, as_index=False).count()
In [72]:
# Distinct market names present in the grouped frame.
dfState.market.unique()
Out[72]:
In [73]:
# Raw `state` values as they currently appear in the data. Each row below
# lines up, entry for entry, with the corresponding row of `state_new`.
state_now = (
    'PB UP GUJ MS RAJ BANGALORE KNT BHOPAL OR '
    'BHR WB CHANDIGARH CHENNAI bellary podisu UTT '
    'DELHI MP TN Podis GUWAHATI HYDERABAD JAIPUR '
    'WHITE JAMMU HR KOLKATA AP LUCKNOW MUMBAI '
    'NAGPUR KER PATNA CHGARH JH SHIMLA SRINAGAR '
    'TRIVENDRUM'
).split()
In [74]:
# Replacement state codes, paired position by position with `state_now`
# (both lists hold 38 entries).
# The last entry used to be 'KEL', while the raw value 'KER' maps to 'KER';
# that left the same state under two codes, so it is now 'KER' as well.
# NOTE(review): 'CHGARH' is paired with 'HR' and 'UTT' with 'UP' — confirm
# these are intended and not mis-assigned states.
state_new = ['PB', 'UP', 'GUJ', 'MS', 'RAJ', 'KNT', 'KNT', 'MP', 'OR',
             'BHR', 'WB', 'CH', 'TN', 'KNT', 'TN', 'UP',
             'DEL', 'MP', 'TN', 'TN', 'ASM', 'AP', 'RAJ',
             'MS', 'JK', 'HR', 'WB', 'AP', 'UP', 'MS',
             'MS', 'KER', 'BHR', 'HR', 'JH', 'HP', 'JK',
             'KER']
In [75]:
# Swap every raw state value listed in `state_now` for the code at the same
# position in `state_new`.
df['state'] = df['state'].replace(to_replace=state_now, value=state_new)
In [76]:
# Distinct `state` values after the replacement.
df.state.unique()
Out[76]:
In [77]:
# Preview the frame after the state codes were replaced.
df.head()
Out[77]:
In [78]:
# Inspect the current index; a later cell replaces it with a PeriodIndex.
df.index
Out[78]:
In [79]:
# Quick check of how pandas parses a 'Month Year' string.
pd.to_datetime('January 2012')
Out[79]:
In [80]:
# Build a '<month>-<year>' string per row from the `month` and `year` columns.
year_text = df['year'].astype(str)
df['date'] = df['month'] + '-' + year_text
In [82]:
# Leftover IPython introspection (`??map`) removed: it is not valid Python,
# opens the pager on "Run All", and is not part of the cleaning steps.
In [81]:
# Preview the frame with the new `date` column.
df.head()
Out[81]:
In [85]:
# Parse the strings in `df.date` into timestamps.
# NOTE(review): `index` is not referenced by any later cell visible here (the
# frame's index is built from `df.date` directly) — confirm it is still needed.
index = pd.to_datetime(df.date)
In [86]:
# Index the frame by monthly periods derived from the `date` column.
monthly_periods = pd.PeriodIndex(data=df['date'], freq='M')
df.index = monthly_periods
In [ ]:
# List the column names of the frame.
df.columns
In [87]:
# Confirm the frame now carries the monthly PeriodIndex.
df.index
Out[87]:
In [88]:
# Final preview of the cleaned frame before it is saved.
df.head()
Out[88]:
In [ ]:
# Save the cleaned frame to CSV. The index is not written; the `date` column
# it was built from is still among the saved columns.
clean_csv_path = 'MonthWiseMarketArrivals_Clean.csv'
df.to_csv(clean_csv_path, index=False)