Data Cleaning



In [28]:
import pandas as pd
import numpy as np

In [3]:
# Load the prepared rice dataset and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
rice_csv_path = "/Users/macbook/Documents/BTP/Notebook/BTP/Ricex_prepared2.csv"
rice = pd.read_csv(rice_csv_path)
rice.head()


Out[3]:
Unnamed: 0 State_Name ind_district Crop_Year Season Crop Area Production Value X1 ... 11_B10_Mn 12_B1_Mn 12_B2_Mn 12_B3_Mn 12_B4_Mn 12_B5_Mn 12_B6_Mn 12_B7_Mn 12_B9_Mn 12_B10_Mn
0 1 Chandigarh chandigarh 2005 kharif Rice 50 250 5.00 400 ... 10.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0
1 3 Chandigarh chandigarh 2007 kharif Rice 50 250 5.00 250 ... 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0
2 4 Chandigarh chandigarh 2008 kharif Rice 20 100 5.00 250 ... 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0
3 5 Chandigarh chandigarh 2009 kharif Rice 20 100 5.00 100 ... 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0
4 6 Chandigarh chandigarh 2010 kharif Rice 20 103 5.15 100 ... 10.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0

5 rows × 335 columns


In [4]:
# "Unnamed: 0" is the row index that was saved into the CSV as a regular
# column; remove it so that only the real columns remain.
rice = rice.drop("Unnamed: 0", axis="columns")

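After dropping the index column, 334 columns remain: 334 = 10 + 216 + 108, i.e. 10 leading columns, then 216 columns that the fill step below processes in 12 blocks of 18, then 108 trailing columns that the fill step does not touch.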


In [13]:
# Column labels in positional order, so cols[k] is the label of the column
# at integer position k.
cols = rice.columns.tolist()

Fill every empty block of band values with the mean of the same band values taken over all the months of that particular year (i.e. the same row) that do have data.


In [26]:
# Column layout assumed by the fill step (334 = 10 + 216 + 108).
LEADING_COLS = 10    # columns that precede the first block
BLOCK_WIDTH = 18     # columns per block (216 / 18 = 12 blocks)
TRAILING_COLS = 108  # columns at the end that this step does not touch


def fill_missing_blocks(frame, leading=LEADING_COLS, width=BLOCK_WIDTH,
                        trailing=TRAILING_COLS):
    """Return a copy of ``frame`` with empty blocks filled by the row's block mean.

    The columns between ``leading`` and ``n_cols - trailing`` are read as
    consecutive blocks of ``width`` columns.  Within one row a block counts as
    empty when its FIRST column is null.  Every empty block is overwritten
    (all ``width`` columns) with the position-wise mean of the non-empty
    blocks of the same row.

    Differences from the original cell-by-cell loop:

    * A row in which every block is empty is left unchanged.  The loop divided
      by zero there and wrote NaN over the whole row range, which also wiped
      any values present in columns 2..width of those blocks.
    * The work is done on one NumPy array instead of per-cell ``iloc`` access.

    NaN values inside a non-empty block still propagate into the mean, as in
    the original loop.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose block columns are numeric (NaN marks missing data).
    leading, width, trailing : int
        Column layout; the defaults match the 10 + 216 + 108 layout.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    ValueError
        If the last block would extend past the last column of ``frame``.
    """
    n_rows, n_cols = frame.shape
    starts = np.arange(leading, n_cols - trailing, width)
    if n_rows == 0 or starts.size == 0:
        return frame.copy()

    stop = int(starts[-1]) + width
    if stop > n_cols:
        raise ValueError(
            "last block would end at column %d but the frame has only %d columns"
            % (stop, n_cols)
        )

    # View the block region as (row, block, position-in-block).
    cube = frame.iloc[:, leading:stop].values.astype(float)
    cube = cube.reshape(n_rows, starts.size, width)

    # A block is empty when its first column is null.
    missing = np.isnan(cube[:, :, 0])
    present = ~missing
    n_present = present.sum(axis=1)

    # Position-wise sum over the non-empty blocks of each row.
    sums = np.where(present[:, :, None], cube, 0.0).sum(axis=1)

    # Only rows with at least one non-empty block have a defined mean.
    fillable = n_present > 0
    means = np.full_like(sums, np.nan)
    means[fillable] = sums[fillable] / n_present[fillable, None]

    fill_mask = missing & fillable[:, None]
    filled = np.where(fill_mask[:, :, None], means[:, None, :], cube)

    result = frame.copy()
    result.iloc[:, leading:stop] = filled.reshape(n_rows, -1)
    return result


rice = fill_missing_blocks(rice)

In [27]:
# Save the cleaned table. index=False keeps the row index out of the file;
# otherwise it is written as an unnamed first column and comes back as
# "Unnamed: 0" the next time the CSV is read.
rice.to_csv("Rice_Ready.csv", index=False)