"Data is messy"
We will be performing the following operation on our Onion price to refine it
Other stuff you may need to do to refine are...
In [ ]:
# Import the two library we need, which is Pandas and Numpy
import pandas as pd
import numpy as np
In [ ]:
# Read the csv file of Month Wise Market Arrival data that has been scraped.
df = pd.read_csv('MonthWiseMarketArrivals.csv')
In [ ]:
df.head()
In [ ]:
df.tail()
In [ ]:
df.dtypes
In [ ]:
# Delete the last row from the dataframe
df.tail(1)
In [ ]:
# Delete a row from the dataframe
df.drop(df.tail(1).index, inplace = True)
In [ ]:
df.tail()
In [ ]:
df.dtypes
In [ ]:
df.iloc[:,4:7].head()
In [ ]:
df.iloc[:,2:7] = df.iloc[:,2:7].astype(int)
In [ ]:
df.dtypes
In [ ]:
df.head()
In [ ]:
df.describe()
In [ ]:
df.market.value_counts().head()
In [ ]:
df['state'] = df.market.str.split('(').str[-1]
In [ ]:
df.head()
In [ ]:
df['city'] = df.market.str.split('(').str[0]
In [ ]:
df.head()
In [ ]:
df.state.unique()
In [ ]:
df['state'] = df.state.str.split(')').str[0]
In [ ]:
df.state.unique()
In [ ]:
dfState = df.groupby(['state', 'market'], as_index=False).count()
In [ ]:
dfState.market.unique()
In [ ]:
state_now = ['PB', 'UP', 'GUJ', 'MS', 'RAJ', 'BANGALORE', 'KNT', 'BHOPAL', 'OR',
'BHR', 'WB', 'CHANDIGARH', 'CHENNAI', 'bellary', 'podisu', 'UTT',
'DELHI', 'MP', 'TN', 'Podis', 'GUWAHATI', 'HYDERABAD', 'JAIPUR',
'WHITE', 'JAMMU', 'HR', 'KOLKATA', 'AP', 'LUCKNOW', 'MUMBAI',
'NAGPUR', 'KER', 'PATNA', 'CHGARH', 'JH', 'SHIMLA', 'SRINAGAR',
'TRIVENDRUM']
In [ ]:
state_new =['PB', 'UP', 'GUJ', 'MS', 'RAJ', 'KNT', 'KNT', 'MP', 'OR',
'BHR', 'WB', 'CH', 'TN', 'KNT', 'TN', 'UP',
'DEL', 'MP', 'TN', 'TN', 'ASM', 'AP', 'RAJ',
'MS', 'JK', 'HR', 'WB', 'AP', 'UP', 'MS',
'MS', 'KER', 'BHR', 'HR', 'JH', 'HP', 'JK',
'KEL']
In [ ]:
df.state = df.state.replace(state_now, state_new)
In [ ]:
df.state.unique()
In [ ]:
df.index
In [ ]:
pd.to_datetime('January 2012')
In [ ]:
df['date'] = df['month'] + '-' + df['year'].map(str)
In [ ]:
df.head()
In [ ]:
index = pd.to_datetime(df.date)
In [ ]:
df.index = pd.PeriodIndex(df.date, freq='M')
In [ ]:
df.columns
In [ ]:
df.index
In [ ]:
df.head()
In [ ]:
df.to_csv('MonthWiseMarketArrivals_Clean.csv', index = False)