notebook.community

Edit and run



In [10]:

    
#importing pandas package
import pandas as pd
import numpy as np
from pandas import datetime
#Importing the classification_report,confusion_matrix from sklearn.metrics package
from sklearn.metrics import classification_report,confusion_matrix

path_to_projectData = 'Data/' #input path where the data of the project resides
train_data_df = pd.read_csv(path_to_projectData + 'training_data_2_csv_UTF.csv') #reading the bots_data.csv file

#Selecting few columns from the train dataframe
train_data= train_data_df[['followers_count','friends_count','listedcount','favourites_count','verified','statuses_count'
                           ,'screen_name','description','created_at','location']]


test_data_df= pd.read_csv(path_to_projectData + 'test_data_4_students.csv')
test_data_df.head(10)

#Selecting non NaN rows from the test dataset
test_data_df=test_data_df.head(575)
#Selecting few columns from the test dataframe
test_data=test_data_df[['followers_count','friends_count','listed_count','favorites_count','verified','statuses_count',
                        'screen_name','description','created_at','location']]

train_data_df
X_new_train=train_data



In [11]:

    
#Preprocessing and cleaning of training data

train_data['followers_count']=train_data['followers_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['friends_count']=train_data['friends_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['listedcount']=train_data['listedcount'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['favourites_count']=train_data['favourites_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['verified']=train_data['verified'].apply(lambda x: 0 if x==False else 1 if x==True else -1)
train_data['description']=train_data['description'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)
train_data['is_bot_screenname']=0
train_data['is_bot_screenname'] = train_data['screen_name'].apply(lambda x:1 if 'bot' in x  else 1 if 'Bot' in x  else 1 
                                                                   if 'BOT' in x else 0)
train_data['is_bot_description']=0
train_data['is_bot_description'] = train_data['description'].apply(lambda x:1 if 'bot' in x  else 1 if 'Bot' in x  else 1 
                                                                   if 'BOT' in x else 0)

train_data['location']=train_data['location'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)

train_data['is_bot_location'] = 0
train_data['is_bot_location'] = train_data['location'].apply(lambda x:0 if x=="" else len(x))


#Duplicating the train_data frame into new DataFrame
train_data_new = train_data.copy()

#Check whether there are null values in 'created_at' column name
pd.isnull(train_data_new['created_at']).values.any()


#Stripping the quotes across the created_at column
train_data_new['created_at'] = train_data_new['created_at'].map(lambda x: x.lstrip('"').rstrip('"'))

#Converting the created_at column to pandas datetime format
train_data_new['created_at'] = pd.to_datetime(train_data_new['created_at'])

#Creating new column called age to calculate the Age of the profile 
train_data_new['age']=0

time_stamp_now= datetime.now()

#Calculating the age of the profile by subtracting it with the current date 
train_data_new['age']=time_stamp_now.date()-train_data_new['created_at']

#Converting the age of profile to days i.e in numeric format
train_data_new['age']=train_data_new['age'].astype('timedelta64[D]')
train_data['age']=train_data_new['age']


#train_data['friends_by_followers'] = train_data.friends_count/train_data.followers_count
#Dropping the created_at column from train_data Frame
train_data.drop('created_at',axis=1, inplace=True)
train_data.drop('description', axis=1, inplace=True)
train_data.drop('screen_name', axis=1, inplace=True)
train_data.drop('location', axis=1, inplace=True)




X_train = train_data
Y_train= train_data_df['bot']









    



/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:47: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:52: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:53: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:54: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:55: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [12]:

    
#Preprocessing and cleaning of test data

test_data['followers_count']=test_data['followers_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['friends_count']=test_data['friends_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['listed_count']=test_data['listed_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['favorites_count']=test_data['favorites_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['verified']=test_data['verified'].apply(lambda x: 0 if x=='FALSE' else 1 if x=='TRUE' else -1)
test_data['is_bot_screenname']=0
test_data['is_bot_screenname'] = test_data['screen_name'].apply(lambda x:len(x) if 'bot' in x  else len(x) if 'Bot' in x  else len(x) 
                                                                   if 'BOT' in x else 0)

test_data['is_bot_description']=0
test_data['description']=test_data['description'].apply(lambda x: "" if type(x)==float else "" if x=='NaN' else ""
                                                        if x=='None' else x)

test_data['is_bot_description'] = test_data['description'].apply(lambda x:len(x) if 'bot' in x  else len(x) if 'Bot' in x  else len(x) 
                                                                   if 'BOT' in x else 0)



test_data['location']=test_data['location'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)
test_data['is_bot_location'] = 0
test_data['is_bot_location'] = test_data['location'].apply(lambda x:0 if x=="" else len(x))


#Stripping the quotes across the created_at column
test_data['created_at'] = test_data['created_at'].map(lambda x: x.lstrip('"').rstrip('"'))

#Converting the created_at column to pandas datetime format
test_data['created_at'] = pd.to_datetime(test_data['created_at'])

test_data_new = test_data.copy()
#Creating new column called age to calculate the Age of the profile 
test_data_new['age']=0

#test_data['friends_by_followers'] = test_data.friends_count/test_data.followers_count


time_stamp_now= datetime.now()


#Calculating the age of the profile by subtracting it with the current date
test_data_new['age']=time_stamp_now.date()-test_data_new['created_at']

#Converting the age of profile to days i.e in numeric format
test_data_new['age']=test_data_new['age'].astype('timedelta64[D]')
test_data['age']=test_data_new['age']

#Dropping the created_at column from train_data Frame
test_data.drop('created_at',axis=1, inplace=True)



test_data['followers_count'].fillna(0,inplace=True)
test_data['friends_count'].fillna(0,inplace=True)
test_data['listed_count'].fillna(0,inplace=True)
test_data['favorites_count'].fillna(0,inplace=True)
test_data['verified'].fillna(-1,inplace=True)
test_data['statuses_count'].fillna(0,inplace=True)
test_data.drop('screen_name', axis=1, inplace=True)
test_data.drop('description', axis=1, inplace=True)
test_data.drop('location',axis=1, inplace=True)

X_test=test_data

X_train









    



/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:28: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:48: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:51: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:3295: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:61: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:62: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:63: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[12]:






  
    
      
      followers_count
      friends_count
      listedcount
      favourites_count
      verified
      statuses_count
      is_bot_screenname
      is_bot_description
      is_bot_location
      age
    
  
  
    
      0
      1291
      0
      10
      0
      0
      78554
      0
      0
      13
      125.0
    
    
      1
      1
      349
      0
      38
      0
      31
      0
      0
      25
      461.0
    
    
      2
      1086
      0
      14
      0
      0
      713
      1
      1
      0
      534.0
    
    
      3
      33
      0
      8
      0
      0
      676
      0
      0
      0
      802.0
    
    
      4
      11
      745
      0
      146
      0
      185
      0
      0
      21
      857.0
    
    
      5
      1
      186
      0
      0
      0
      11
      0
      0
      10
      55.0
    
    
      6
      193
      0
      19
      0
      0
      6068
      0
      1
      13
      1096.0
    
    
      7
      8227
      2
      89
      26
      0
      2597
      0
      1
      0
      689.0
    
    
      8
      275
      0
      17
      23
      0
      9922
      0
      1
      0
      833.0
    
    
      9
      51
      3
      9
      0
      0
      2515
      0
      0
      0
      670.0
    
    
      10
      51
      1
      12
      0
      0
      111
      0
      0
      11
      1003.0
    
    
      11
      2
      1
      4
      0
      0
      230
      0
      1
      0
      958.0
    
    
      12
      0
      29
      0
      0
      0
      0
      0
      0
      0
      56.0
    
    
      13
      1
      206
      0
      0
      0
      0
      0
      0
      0
      1597.0
    
    
      14
      0
      38
      0
      0
      0
      0
      0
      0
      0
      56.0
    
    
      15
      109
      0
      16
      0
      0
      16067
      0
      0
      0
      807.0
    
    
      16
      250
      0
      25
      0
      0
      31721
      0
      0
      0
      1057.0
    
    
      17
      15
      1941
      1
      319
      0
      406
      0
      0
      14
      857.0
    
    
      18
      190
      1899
      0
      27
      0
      0
      0
      0
      28
      61.0
    
    
      19
      181
      0
      24
      0
      0
      21506
      0
      1
      0
      950.0
    
    
      20
      10175
      11465
      199
      328
      0
      65022
      0
      0
      0
      1316.0
    
    
      21
      23
      4
      8
      4
      0
      6230
      0
      0
      7
      1100.0
    
    
      22
      106126
      0
      999
      2
      0
      30156
      1
      0
      19
      1319.0
    
    
      23
      2302
      3
      161
      4
      0
      7640
      1
      1
      5
      558.0
    
    
      24
      443
      2
      17
      6
      0
      9584
      0
      1
      6
      667.0
    
    
      25
      27
      1
      10
      0
      0
      145
      0
      0
      0
      1063.0
    
    
      26
      191
      2
      19
      0
      0
      5684
      0
      0
      8
      1079.0
    
    
      27
      1
      35
      1
      54
      0
      82
      0
      0
      0
      90.0
    
    
      28
      651
      0
      36
      0
      0
      19867
      0
      1
      0
      918.0
    
    
      29
      37
      45
      7
      0
      0
      191
      0
      0
      29
      977.0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      2767
      1017758
      6985
      9473
      12901
      1
      13786
      0
      0
      15
      3161.0
    
    
      2768
      9
      97
      0
      0
      0
      1
      0
      0
      10
      1172.0
    
    
      2769
      2000567
      457
      9674
      901
      1
      55202
      0
      0
      25
      2881.0
    
    
      2770
      11
      53
      0
      1
      0
      13
      0
      0
      2
      2387.0
    
    
      2771
      1599
      922
      67
      212
      0
      11240
      0
      0
      24
      3017.0
    
    
      2772
      82
      91
      0
      634
      0
      704
      0
      0
      0
      1418.0
    
    
      2773
      178
      325
      6
      30
      0
      405
      0
      0
      16
      1707.0
    
    
      2774
      46
      233
      0
      935
      0
      285
      0
      0
      0
      2061.0
    
    
      2775
      67
      168
      0
      0
      0
      1851
      0
      0
      6
      2450.0
    
    
      2776
      147441
      20889
      0
      43
      1
      7648
      0
      0
      15
      3097.0
    
    
      2777
      420699
      108
      5248
      11
      1
      5568
      0
      0
      30
      2976.0
    
    
      2778
      63866
      21
      2037
      40
      1
      70
      0
      0
      0
      2790.0
    
    
      2779
      1715
      1342
      84
      3688
      0
      1395
      0
      0
      17
      2799.0
    
    
      2780
      5
      67
      0
      0
      0
      2
      0
      0
      21
      2074.0
    
    
      2781
      122
      88
      0
      325
      0
      1031
      0
      0
      13
      1468.0
    
    
      2782
      4025
      487
      117
      18
      1
      6672
      0
      0
      0
      2838.0
    
    
      2783
      52
      39
      0
      689
      0
      206
      0
      0
      6
      324.0
    
    
      2784
      30
      72
      4
      1
      0
      926
      0
      0
      27
      1744.0
    
    
      2785
      41776
      1158
      854
      1264
      1
      1096
      0
      0
      9
      2993.0
    
    
      2786
      52
      323
      2
      696
      0
      767
      0
      0
      12
      2679.0
    
    
      2787
      972032
      3072
      1587
      1423
      1
      22362
      0
      0
      10
      2828.0
    
    
      2788
      26
      150
      1
      8
      0
      2
      0
      0
      0
      2672.0
    
    
      2789
      215
      462
      16
      812
      0
      2384
      0
      0
      12
      3257.0
    
    
      2790
      64219
      667
      698
      2901
      1
      85
      0
      0
      14
      2854.0
    
    
      2791
      310
      0
      21
      3
      0
      3909
      1
      0
      0
      1272.0
    
    
      2792
      18998
      2005
      425
      2503
      0
      3498
      0
      0
      0
      2309.0
    
    
      2793
      32
      54
      0
      1
      0
      97
      0
      0
      11
      2269.0
    
    
      2794
      45044433
      7451
      68157
      24
      1
      9606
      0
      0
      18
      3215.0
    
    
      2795
      16
      64
      1
      15
      0
      62
      0
      0
      23
      1730.0
    
    
      2796
      22490
      308
      1342
      43
      1
      1897
      0
      0
      9
      3016.0
    
  

2797 rows × 10 columns



In [4]:

    
###Generating the output file for submission to kaggle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

rfc= RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=42)
rfc.fit(X_train,Y_train)
rfc_predictions=rfc.predict(X_test)
rfc_predictions

test_data['bot'] = np.array(rfc_predictions)

result = pd.DataFrame(test_data_df['id'])

result  = result.astype(np.int64)
result['bot'] = test_data['bot']
result
print (result['id'].dtype)
print (result['bot'].dtype)
print (result)

result.to_csv('output_new33.csv',  index=False)









    



int64
int64
                     id  bot
0            2281292622    0
1            2344040251    0
2             765871267    0
3            4772373433    0
4            1324548560    1
5            2561341789    1
6             347810134    1
7             856303860    1
8    832875000000000000    1
9              88856792    0
10   713557000000000000    1
11           1566746503    0
12             90420314    0
13            184910040    0
14            157690631    0
15             42420346    0
16             42382447    0
17             43993280    0
18           2305236733    1
19   742794000000000000    1
20             31348594    0
21            122085859    0
22             23573083    0
23             43152482    0
24            188857501    0
25           2911272579    0
26             35094637    0
27            146252766    0
28             85430866    0
29             55117855    0
..                  ...  ...
545  731201000000000000    1
546           612754791    1
547          4493562022    1
548          2897136909    1
549            78956001    0
550          4462343293    1
551          3830053332    1
552           586671909    0
553           332888068    0
554  756937000000000000    1
555          2163813157    1
556          2602312513    1
557  813000000000000000    0
558           355883433    0
559          3229506502    1
560            20322929    0
561            16712547    0
562           268439864    0
563          2566951536    1
564           174632702    1
565           843270408    1
566          2564439320    1
567           342737458    1
568           566078011    1
569           268809577    0
570  750999000000000000    1
571            10228272    0
572           218833868    1
573           176566242    0
574          3119554528    1

[575 rows x 2 columns]






    



/home/shivraj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [5]:

    
#Using 5 fold cross validation to check accuracy of RFC model
from sklearn.cross_validation import cross_val_score
accuracyScore = cross_val_score(rfc,X_train, Y_train, cv=5, scoring='accuracy')
print("Accuracy = ",accuracyScore.mean())









    



/home/shivraj/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)






    



Accuracy =  0.921344136939



In [6]:

    
#Using training data to evaluate the model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


X_train,X_test,Y_train,Y_test=train_test_split(X_train,Y_train,test_size=0.15,random_state=50)
rfc= RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=42)
rfc.fit(X_train,Y_train)
rfc_predictions=rfc.predict(X_test)
rfc_predictions

#Printing the Confusion Matrix and Classification Report for RandomForestClassifier
#print
print(confusion_matrix(Y_test,rfc_predictions))
print('\n')
print(classification_report(Y_test,rfc_predictions))
print("accuracy score: {} ".format(accuracy_score(Y_test, rfc_predictions)))  
print("AUC: {}".format(roc_auc_score(Y_test,rfc_predictions)))









    



[[208  15]
 [ 20 177]]


             precision    recall  f1-score   support

          0       0.91      0.93      0.92       223
          1       0.92      0.90      0.91       197

avg / total       0.92      0.92      0.92       420

accuracy score: 0.9166666666666666 
AUC: 0.9156062916846874



In [ ]:



In [7]:

    
#Checking for top 3 features
from sklearn.feature_selection import RFE
rfe = RFE(rfc, 3)
fit = rfe.fit(X_train, Y_train)
print("Num Features: {}".format( fit.n_features_))
print("Selected Features:{}".format(fit.support_))
print("Feature Ranking: {}".format( fit.ranking_))

print(rfc.feature_importances_)









    



Num Features: 3
Selected Features:[ True  True False False False False False False False  True]
Feature Ranking: [1 1 5 2 4 3 8 7 6 1]
[ 0.0989613   0.16304229  0.05704616  0.09374155  0.08134576  0.07459192
  0.00959029  0.02213993  0.02360453  0.37593628]



In [13]:

    
#Feature rankings and scores
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features
X= X_new_train
y= Y_train

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. Feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    
#feature names array
fnames=['followersCount' ,'friendsCount' ,'listedcount' ,'favouritesCount', 'verified', 'statusesCount' ,'is_Bot_Screen' 'isBotDesc', 'isBotLoc' ,'age']
# Plot the feature importances of the forest
plt.figure(figsize=(18,5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="b", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]),fnames )
plt.xlim([-1, X.shape[1]])
plt.show()









    



Feature ranking:
1. Feature 9 (0.350076)
2. Feature 4 (0.152645)
3. Feature 3 (0.079035)
4. Feature 7 (0.077435)
5. Feature 1 (0.071741)
6. Feature 5 (0.071428)
7. Feature 0 (0.061609)
8. Feature 8 (0.052398)
9. Feature 2 (0.047508)
10. Feature 6 (0.036126)



In [14]:

    
#importing matplot lib and seaborn library
import matplotlib.pyplot as plt
import seaborn as sns

bots = train_data[train_data_df.bot==1]
non_bots = train_data[train_data_df.bot==0]

#Plotting a graph to check if there is a correlation between Friends Count and Followers count for bots
plt.figure()
plt.title('Friends Count vs Followers Count for Bots')
sns.regplot(bots.friends_count, bots.followers_count,color='green', label='bots')
plt.xlim(0, 4000)
plt.ylim(0, 10000)
plt.show()



In [15]:

    
#Plotting a graph to check if there is a correlation between Friends Count and Followers count for nonbots
plt.title('Friends Count vs Followers Count for NonBots')
sns.regplot(non_bots.friends_count, non_bots.followers_count, color='red', label='nonBots')
plt.xlim(0, 4000)
plt.ylim(0, 10000)
plt.show()



In [16]:

    
#Plotting a graph to check if there is a correlation between Friends Count and Listed count for bots
plt.figure()
plt.title('Friends Count vs Listed Count for Bots')
sns.regplot(bots.friends_count, bots.listedcount, label='bots',color='green')
plt.xlim(0, 1000)
plt.ylim(0, 1000)
plt.show()



In [17]:

    
#Plotting a graph to check if there is a correlation between Friends Count and Followers count for nonbots
plt.title('Friends Count vs Listed Count for NonBots')
sns.regplot(non_bots.friends_count, non_bots.listedcount, label='nonBots',color='red')
plt.xlim(0, 1000)
plt.ylim(0, 1000)
plt.show()



In [18]:

    
#Plotting a graph to check if there is a correlation between Friends Count and age of profile for bots
plt.figure()
plt.title('Friends Count vs Age of Profile for Bots')
sns.regplot(bots.friends_count, bots.age,color='green', label='bots')
plt.xlim(0, 4000)
plt.ylim(0, 10000)
plt.show()



In [19]:

    
#Plotting a graph to check if there is a correlation between Friends Count and Age of profile for nonbots
plt.title('Friends Count vs Age of Profile for NonBots')
sns.regplot(non_bots.friends_count, non_bots.age, label='nonBots',color='red')
plt.xlim(0, 1000)
plt.ylim(0, 1000)
plt.show()



In [ ]:

	followers_count	friends_count	listedcount	favourites_count	verified	statuses_count	is_bot_screenname	is_bot_description	is_bot_location	age
0	1291	0	10	0	0	78554	0	0	13	125.0
1	1	349	0	38	0	31	0	0	25	461.0
2	1086	0	14	0	0	713	1	1	0	534.0
3	33	0	8	0	0	676	0	0	0	802.0
4	11	745	0	146	0	185	0	0	21	857.0
5	1	186	0	0	0	11	0	0	10	55.0
6	193	0	19	0	0	6068	0	1	13	1096.0
7	8227	2	89	26	0	2597	0	1	0	689.0
8	275	0	17	23	0	9922	0	1	0	833.0
9	51	3	9	0	0	2515	0	0	0	670.0
10	51	1	12	0	0	111	0	0	11	1003.0
11	2	1	4	0	0	230	0	1	0	958.0
12	0	29	0	0	0	0	0	0	0	56.0
13	1	206	0	0	0	0	0	0	0	1597.0
14	0	38	0	0	0	0	0	0	0	56.0
15	109	0	16	0	0	16067	0	0	0	807.0
16	250	0	25	0	0	31721	0	0	0	1057.0
17	15	1941	1	319	0	406	0	0	14	857.0
18	190	1899	0	27	0	0	0	0	28	61.0
19	181	0	24	0	0	21506	0	1	0	950.0
20	10175	11465	199	328	0	65022	0	0	0	1316.0
21	23	4	8	4	0	6230	0	0	7	1100.0
22	106126	0	999	2	0	30156	1	0	19	1319.0
23	2302	3	161	4	0	7640	1	1	5	558.0
24	443	2	17	6	0	9584	0	1	6	667.0
25	27	1	10	0	0	145	0	0	0	1063.0
26	191	2	19	0	0	5684	0	0	8	1079.0
27	1	35	1	54	0	82	0	0	0	90.0
28	651	0	36	0	0	19867	0	1	0	918.0
29	37	45	7	0	0	191	0	0	29	977.0
...	...	...	...	...	...	...	...	...	...	...
2767	1017758	6985	9473	12901	1	13786	0	0	15	3161.0
2768	9	97	0	0	0	1	0	0	10	1172.0
2769	2000567	457	9674	901	1	55202	0	0	25	2881.0
2770	11	53	0	1	0	13	0	0	2	2387.0
2771	1599	922	67	212	0	11240	0	0	24	3017.0
2772	82	91	0	634	0	704	0	0	0	1418.0
2773	178	325	6	30	0	405	0	0	16	1707.0
2774	46	233	0	935	0	285	0	0	0	2061.0
2775	67	168	0	0	0	1851	0	0	6	2450.0
2776	147441	20889	0	43	1	7648	0	0	15	3097.0
2777	420699	108	5248	11	1	5568	0	0	30	2976.0
2778	63866	21	2037	40	1	70	0	0	0	2790.0
2779	1715	1342	84	3688	0	1395	0	0	17	2799.0
2780	5	67	0	0	0	2	0	0	21	2074.0
2781	122	88	0	325	0	1031	0	0	13	1468.0
2782	4025	487	117	18	1	6672	0	0	0	2838.0
2783	52	39	0	689	0	206	0	0	6	324.0
2784	30	72	4	1	0	926	0	0	27	1744.0
2785	41776	1158	854	1264	1	1096	0	0	9	2993.0
2786	52	323	2	696	0	767	0	0	12	2679.0
2787	972032	3072	1587	1423	1	22362	0	0	10	2828.0
2788	26	150	1	8	0	2	0	0	0	2672.0
2789	215	462	16	812	0	2384	0	0	12	3257.0
2790	64219	667	698	2901	1	85	0	0	14	2854.0
2791	310	0	21	3	0	3909	1	0	0	1272.0
2792	18998	2005	425	2503	0	3498	0	0	0	2309.0
2793	32	54	0	1	0	97	0	0	11	2269.0
2794	45044433	7451	68157	24	1	9606	0	0	18	3215.0
2795	16	64	1	15	0	62	0	0	23	1730.0
2796	22490	308	1342	43	1	1897	0	0	9	3016.0