notebook.community

Edit and run



In [1]:

    
#importing pandas package
import pandas as pd
import numpy as np

#Input path where the data of the project resides
path_to_projectData = 'Data/' 

#Reading the TRAINING DATA CSV file
train_data_df = pd.read_csv(path_to_projectData + 'training_data_2_csv_UTF.csv') 

#SELECTING TOP 10 ROWS FROM THE TRAINING DATASET
train_data_df.head(10)









    Out[1]:






  
    
      
      id
      id_str
      screen_name
      location
      description
      url
      followers_count
      friends_count
      listedcount
      created_at
      favourites_count
      verified
      statuses_count
      lang
      status
      default_profile
      default_profile_image
      has_extended_profile
      name
      bot
    
  
  
    
      0
      8.160000e+17
      "815745789754417152"
      "HoustonPokeMap"
      "Houston, TX"
      "Rare and strong PokŽmon in Houston, TX. See m...
      "https://t.co/dnWuDbFRkt"
      1291
      0
      10
      "Mon Jan 02 02:25:26 +0000 2017"
      0
      False
      78554
      "en"
      {\r      "created_at": "Sun Mar 12 15:44:04 +0...
      True
      False
      False
      "Houston PokŽ Alert"
      1
    
    
      1
      4.843621e+09
      4843621225
      kernyeahx
      Templeville town, MD, USA
      From late 2014 Socium Marketplace will make sh...
      NaN
      1
      349
      0
      2/1/2016 7:37
      38
      False
      31
      en
      null
      True
      False
      False
      Keri Nelson
      1
    
    
      2
      4.303727e+09
      4303727112
      mattlieberisbot
      NaN
      Inspired by the smart, funny folks at @replyal...
      https://t.co/P1e1o0m4KC
      1086
      0
      14
      Fri Nov 20 18:53:22 +0000 2015
      0
      False
      713
      en
      {'retweeted': False, 'is_quote_status': False,...
      True
      False
      False
      Matt Lieber Is Bot
      1
    
    
      3
      3.063139e+09
      3063139353
      sc_papers
      NaN
      NaN
      NaN
      33
      0
      8
      2/25/2015 20:11
      0
      False
      676
      en
      Construction of human anti-tetanus single-chai...
      True
      True
      False
      single cell papers
      1
    
    
      4
      2.955142e+09
      2955142070
      lucarivera16
      Dublin, United States
      Inspiring cooks everywhere since 1956.
      NaN
      11
      745
      0
      1/1/2015 17:44
      146
      False
      185
      en
      null
      False
      False
      False
      lucarivera16
      1
    
    
      5
      8.410000e+17
      8.41E+17
      dantheimprover
      Austin, TX
      Just a guy trying to do good by telling everyo...
      NaN
      1
      186
      0
      13/03/2017 22:53
      0
      False
      11
      en
      Status(_api=<tweepy.api.API object at 0x101927...
      True
      False
      True
      dantheimprover
      1
    
    
      6
      2.482835e+09
      2482834658
      _all_of_us_
      in a machine.
      bot by @rubicon
      NaN
      193
      0
      19
      Wed May 07 22:29:25 +0000 2014
      0
      False
      6068
      en
      {u'contributors': None, u'truncated': False, u...
      False
      False
      False
      everything always
      1
    
    
      7
      3.333574e+09
      3333573622
      KatamariItems
      NaN
      [Bot rolled up by @BeachEpisode] Cataloguing e...
      NaN
      8227
      2
      89
      Thu Jun 18 22:07:31 +0000 2015
      26
      False
      2597
      en
      {u'contributors': None, u'truncated': False, u...
      True
      False
      False
      Katamari Collection
      1
    
    
      8
      2.996105e+09
      2996105102
      AutophagyPapers
      NaN
      Twitterbot for #Autophagy papers. Curated by @...
      NaN
      275
      0
      17
      1/25/2015 17:34
      23
      False
      9922
      en
      Feeding Schedule And Proteolysis Regulate Auto...
      False
      False
      False
      Autophagy Papers
      1
    
    
      9
      3.271096e+09
      3271095818
      HSC_papers
      NaN
      NaN
      NaN
      51
      3
      9
      7/7/2015 15:23
      0
      False
      2515
      en
      Functional Selectivity in Cytokine Signaling R...
      True
      False
      False
      Hematopoiesis
      1



In [2]:

    
# selecting attributes which have textual, numeric values and categorical values
train_data= train_data_df[['screen_name','location','description','url',
       'followers_count', 'friends_count', 'listedcount','created_at',
       'favourites_count','statuses_count','name']]

#Looking for NaN and NULL values in the following features of the dataframe, and using lambda as defined to replace them with 0
train_data['followers_count']=train_data['followers_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['friends_count']=train_data['friends_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['list_count']=train_data['listedcount'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['fav_count']=train_data['favourites_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)

#Dropping columns that have been renamed (the XGBOOST classifier which will be used later found discrepancy in the training and test data feature names, which made this step required)
train_data.drop('listedcount', axis=1, inplace=True)
train_data.drop('favourites_count', axis=1, inplace=True)

#y = lambda symbol: 'X' if symbol==True else 'O' if symbol==False else ' '



#Looking for NaN and NULL values in the location  feature of the dataframe, replacing the respective with "" empty strings
train_data['location']=train_data['location'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)

#Computing a new column  is_bot_location to replace the textual feature - screenname
train_data['is_bot_location'] = 0
train_data['is_bot_location'] = train_data['location'].apply(lambda x:0 if x=="" else len(x))

#Dropping the location feature in the dataframe as we have replaced it with a numerical version is_bot_location
train_data.drop('location', axis=1, inplace=True)



#Computing a new column 'is_bot_screenname' to replace the textual feature - screenname
train_data['is_bot_screenname']=0
screenname = train_data['screen_name']
flag = []
#Looking for the word bot/BOT/Bot in screenname and appending the length of the screenname, if we have a success, else 0
for x in screenname:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
train_data['is_bot_screenname']=flag
#Dropping the screen_name feature in the dataframe as we have replaced it with a numerical version is_bot_screenname
train_data.drop('screen_name', axis=1, inplace=True)



#Computing a new column 'is_bot_description' to replace the textual feature - description
train_data['is_bot_description']=0
description = train_data['description']
flag = []
#Looking for the word bot/BOT/Bot in 'description' and appending the length of the description, if we have a success, else 0
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
train_data['is_bot_description']=flag
#Dropping the 'description' feature in the dataframe as we have replaced it with a numerical version is_bot_description
train_data.drop('description', axis=1, inplace=True)



#Computing a new column 'is_bot_name' to replace the textual feature - name
train_data['is_bot_name']=0
description = train_data['name']
flag = []
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
train_data['is_bot_name']=flag
#Dropping the 'name' feature in the dataframe as we have replaced it with a numerical version is_bot_name
train_data.drop('name', axis=1, inplace=True)


#Computing the number of days since the profile has been created
timestamp = train_data_df['created_at']
train_data['age_of_profile']=0
profile_age=[]
from datetime import datetime
#Preparing a list of date formats which are currently present in the 'created_at' feature of our dataframe
DATE_FORMATS = ['%a %b %d %H:%M:%S +0000 %Y','"%a %b %d %H:%M:%S +0000 %Y"','%Y-%m-%d %H:%M:%S','%d/%m/%Y %H:%M' , '%m/%d/%Y %H:%M']

#Looping through each row, and each date 
#CHECK IF THE FORMAT IS IN ACCORDANCE TO THE LIST DECLARED ABOVE
#Compute the current date and find the difference with the computed 'created_at'
for i in timestamp:
    try:
        for dateformat in DATE_FORMATS:
            try:
                my_date = datetime.strptime(i,dateformat)
                break
            except ValueError:
                pass
        today = datetime.today()
        difference = today - my_date
        profile_age.append(difference.days)
    except ValueError:
        pass

train_data['age_of_profile']=profile_age
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
train_data.drop('created_at', axis=1, inplace=True)


#Computing the URL length
train_data['is_url']=0
train_data['url'].fillna(0,inplace=True)

description = train_data['url']
flag = []
for x in description:
    x = str(x)
    if x == 'None' or x=='0':
        flag.append(0)
    else:
        flag.append(len(x))
train_data['is_url']=flag
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
train_data.drop('url', axis=1, inplace=True)


#PREPARING THE X_TRAIN DATA WITH THE NEW COMPUTED TRAIN_DATA VERSION
X_train=train_data

#PREPARING Y_TRAIN WITH ONLY THE BOT COLUMN DATA FROM THE DATAFRAME
Y_train=train_data_df['bot']

#DISPLAYING ALL THE ROWS OF X_TRAIN
X_train









    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:46: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:51: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:61: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:63: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:68: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:77: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:79: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:84: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:107: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:109: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:113: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:3295: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:124: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:126: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[2]:






  
    
      
      followers_count
      friends_count
      statuses_count
      list_count
      fav_count
      is_bot_location
      is_bot_screenname
      is_bot_description
      is_bot_name
      age_of_profile
      is_url
    
  
  
    
      0
      1291
      0
      78554
      10
      0
      13
      0
      0
      0
      126
      25
    
    
      1
      1
      349
      31
      0
      38
      25
      0
      0
      0
      492
      0
    
    
      2
      1086
      0
      713
      14
      0
      0
      15
      128
      18
      534
      23
    
    
      3
      33
      0
      676
      8
      0
      0
      0
      0
      0
      802
      0
    
    
      4
      11
      745
      185
      0
      146
      21
      0
      0
      0
      857
      0
    
    
      5
      1
      186
      11
      0
      0
      10
      0
      0
      0
      55
      0
    
    
      6
      193
      0
      6068
      19
      0
      13
      0
      15
      0
      1096
      0
    
    
      7
      8227
      2
      2597
      89
      26
      0
      0
      148
      0
      689
      0
    
    
      8
      275
      0
      9922
      17
      23
      0
      0
      55
      0
      833
      0
    
    
      9
      51
      3
      2515
      9
      0
      0
      0
      0
      0
      670
      0
    
    
      10
      51
      1
      111
      12
      0
      11
      0
      0
      0
      1003
      23
    
    
      11
      2
      1
      230
      4
      0
      0
      0
      107
      0
      959
      0
    
    
      12
      0
      29
      0
      0
      0
      0
      0
      0
      0
      57
      0
    
    
      13
      1
      206
      0
      0
      0
      0
      0
      0
      0
      1598
      0
    
    
      14
      0
      38
      0
      0
      0
      0
      0
      0
      0
      57
      0
    
    
      15
      109
      0
      16067
      16
      0
      0
      0
      0
      0
      808
      0
    
    
      16
      250
      0
      31721
      25
      0
      0
      0
      0
      0
      1058
      0
    
    
      17
      15
      1941
      406
      1
      319
      14
      0
      0
      0
      858
      0
    
    
      18
      190
      1899
      0
      0
      27
      28
      0
      0
      0
      -57
      0
    
    
      19
      181
      0
      21506
      24
      0
      0
      0
      86
      0
      951
      23
    
    
      20
      10175
      11465
      65022
      199
      328
      0
      0
      0
      0
      1316
      0
    
    
      21
      23
      4
      6230
      8
      4
      7
      0
      0
      0
      1100
      0
    
    
      22
      106126
      0
      30156
      999
      2
      19
      13
      0
      0
      1319
      23
    
    
      23
      2302
      3
      7640
      161
      4
      5
      12
      136
      12
      558
      23
    
    
      24
      443
      2
      9584
      17
      6
      14
      0
      81
      0
      578
      23
    
    
      25
      27
      1
      145
      10
      0
      0
      0
      0
      0
      1064
      0
    
    
      26
      191
      2
      5684
      19
      0
      8
      0
      0
      0
      1079
      0
    
    
      27
      1
      35
      82
      1
      54
      0
      0
      0
      0
      90
      0
    
    
      28
      651
      0
      19867
      36
      0
      0
      0
      130
      0
      1212
      0
    
    
      29
      37
      45
      191
      7
      0
      29
      0
      0
      0
      978
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      2767
      1017758
      6985
      13786
      9473
      12901
      15
      0
      0
      0
      3161
      22
    
    
      2768
      9
      97
      1
      0
      0
      10
      0
      0
      0
      1172
      0
    
    
      2769
      2000567
      457
      55202
      9674
      901
      27
      0
      0
      0
      2881
      23
    
    
      2770
      11
      53
      13
      0
      1
      2
      0
      0
      0
      2388
      4
    
    
      2771
      1599
      922
      11240
      67
      212
      24
      0
      0
      0
      3017
      22
    
    
      2772
      82
      91
      704
      0
      634
      0
      0
      0
      0
      1419
      0
    
    
      2773
      178
      325
      405
      6
      30
      16
      0
      0
      0
      1708
      22
    
    
      2774
      46
      233
      285
      0
      935
      0
      0
      0
      0
      2061
      0
    
    
      2775
      67
      168
      1851
      0
      0
      6
      0
      0
      0
      2450
      0
    
    
      2776
      147441
      20889
      7648
      0
      43
      15
      0
      0
      0
      3097
      0
    
    
      2777
      420699
      108
      5568
      5248
      11
      30
      0
      0
      0
      2977
      22
    
    
      2778
      63866
      21
      70
      2037
      40
      0
      0
      0
      0
      2790
      23
    
    
      2779
      1715
      1342
      1395
      84
      3688
      17
      0
      0
      0
      2860
      23
    
    
      2780
      5
      67
      2
      0
      0
      21
      0
      0
      0
      2279
      0
    
    
      2781
      122
      88
      1031
      0
      325
      13
      0
      0
      0
      1469
      0
    
    
      2782
      4025
      487
      6672
      117
      18
      0
      0
      0
      0
      2838
      22
    
    
      2783
      52
      39
      206
      0
      689
      6
      0
      0
      0
      324
      23
    
    
      2784
      30
      72
      926
      4
      1
      27
      0
      0
      0
      1744
      24
    
    
      2785
      41776
      1158
      1096
      854
      1264
      9
      0
      0
      0
      2993
      0
    
    
      2786
      52
      323
      767
      2
      696
      12
      0
      0
      0
      2563
      0
    
    
      2787
      972032
      3072
      22362
      1587
      1423
      10
      0
      0
      0
      2829
      0
    
    
      2788
      26
      150
      2
      1
      8
      0
      0
      0
      0
      2672
      0
    
    
      2789
      215
      462
      2384
      16
      812
      12
      0
      0
      0
      3258
      23
    
    
      2790
      64219
      667
      85
      698
      2901
      14
      0
      0
      0
      2854
      16
    
    
      2791
      310
      0
      3909
      21
      3
      0
      11
      0
      0
      1243
      22
    
    
      2792
      18998
      2005
      3498
      425
      2503
      0
      0
      0
      0
      2046
      0
    
    
      2793
      32
      54
      97
      0
      1
      11
      0
      0
      0
      2270
      22
    
    
      2794
      45044433
      7451
      9606
      68157
      24
      18
      0
      0
      0
      3215
      22
    
    
      2795
      16
      64
      62
      1
      15
      23
      0
      0
      0
      1641
      0
    
    
      2796
      22490
      308
      1897
      1342
      43
      9
      0
      0
      0
      3016
      22
    
  

2797 rows × 11 columns



In [3]:

    
#Reading the TEST data CSV file
test_data_df= pd.read_csv(path_to_projectData + 'test_data_4_students.csv')

test_data_df=test_data_df.head(575)

# Selecting Attributes which have textual, numeric values and categorical values
test_data= test_data_df[['screen_name','location','description','url',
       'followers_count', 'friends_count', 'listed_count','created_at',
       'favorites_count','statuses_count','name']]

#Looking for NaN and NULL values in the following feature of the dataframe, and using lambda as defined to replace them with 0
#df.Age.apply(lambda x: x if not pd.isnull(x) else 'Is Null value')
test_data['followers_count']=test_data['followers_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['friends_count']=test_data['friends_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['list_count']=test_data['listed_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['fav_count']=test_data['favorites_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
#Dropping columns that have been renamed (the XGBOOST classifier which will be used later found discrepancy in the training and test data feature names, which made this step required)
test_data.drop('listed_count', axis=1, inplace=True)
test_data.drop('favorites_count', axis=1, inplace=True)
#y = lambda symbol: 'X' if symbol==True else 'O' if symbol==False else ' '
test_data['followers_count'].fillna(0,inplace=True)
test_data['friends_count'].fillna(0,inplace=True)
test_data['statuses_count'].fillna(0,inplace=True)

#Looking for NaN and NULL values in the location  feature of the dataframe, replacing the respective with "" empty strings
test_data['location']=test_data['location'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)

#Computing a new column  is_bot_location to replace the textual feature - screenname
test_data['is_bot_location'] = 0
test_data['is_bot_location'] = test_data['location'].apply(lambda x:0 if x=="" else len(x))

#Dropping the location feature in the dataframe as we have replaced it with a numerical version is_bot_location
test_data.drop('location', axis=1, inplace=True)



#Computing a new column 'is_bot_screenname' to replace the textual feature - screenname
test_data['is_bot_screenname']=0
screenname = test_data['screen_name']
flag = []
#Looking for the word bot/BOT/Bot in screenname and appending the length of the screenname, if we have a success, else 0
for x in screenname:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
test_data['is_bot_screenname']=flag
#Dropping the screen_name feature in the dataframe as we have replaced it with a numerical version is_bot_screenname
test_data.drop('screen_name', axis=1, inplace=True)



#Computing a new column 'is_bot_description' to replace the textual feature - description
test_data['is_bot_description']=0
description = test_data['description']
flag = []
#Looking for the word bot/BOT/Bot in 'description' and appending the length of the description, if we have a success, else 0
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
test_data['is_bot_description']=flag
#Dropping the 'description' feature in the dataframe as we have replaced it with a numerical version is_bot_description
test_data.drop('description', axis=1, inplace=True)



#Computing a new column 'is_bot_name' to replace the textual feature - name
test_data['is_bot_name']=0
description = test_data['name']
flag = []
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
test_data['is_bot_name']=flag
#Dropping the 'name' feature in the dataframe as we have replaced it with a numerical version is_bot_name
test_data.drop('name', axis=1, inplace=True)


#Computing the number of days since the profile has been created
timestamp = test_data['created_at']
test_data['age_of_profile']=0
profile_age=[]
from datetime import datetime

#Preparing a list of date formats which are currently present in the 'created_at' feature of our dataframe
DATE_FORMATS = ['%a %b %d %H:%M:%S +0000 %Y','"%a %b %d %H:%M:%S +0000 %Y"','%Y-%m-%d %H:%M:%S','%d/%m/%Y %H:%M' , '%m/%d/%Y %H:%M']

#Looping through each row, and each date 
#CHECK IF THE FORMAT IS IN ACCORDANCE TO THE LIST DECLARED ABOVE
#Compute the current date and find the difference with the computed 'created_at'
#computing the number of days since profile creation

for i in timestamp:
    try:
        for dateformat in DATE_FORMATS:
            try:
                my_date = datetime.strptime(i,dateformat)
                break
            except ValueError:
                pass
        today = datetime.today()
        difference = today - my_date
        profile_age.append(difference.days)
    except ValueError:
        pass

test_data['age_of_profile']=profile_age
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
test_data.drop('created_at', axis=1, inplace=True)


#Computing the URL length
test_data['is_url']=0
test_data['url'].fillna(0,inplace=True)

description = test_data['url']
flag = []
for x in description:
    x = str(x)
    if x == 'None' or x=='0':
        flag.append(0)
    else:
        flag.append(len(x))
test_data['is_url']=flag
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
test_data.drop('url', axis=1, inplace=True)

#PREPARING THE X_TEST DATA WITH THE NEW COMPUTED TRAIN_DATA VERSION
X_test=test_data

#DISPLAYING ALL THE ROWS OF X_TEST
X_test









    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:39: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:49: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:51: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:56: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:66: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:68: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:73: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:82: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:84: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:89: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:115: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:121: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:134: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[3]:






  
    
      
      followers_count
      friends_count
      statuses_count
      list_count
      fav_count
      is_bot_location
      is_bot_screenname
      is_bot_description
      is_bot_name
      age_of_profile
      is_url
    
  
  
    
      0
      4466
      1295
      3036.0
      111
      1579
      8
      0
      0
      0
      1216
      23
    
    
      1
      295
      1016
      618.0
      10
      300
      15
      0
      0
      0
      1178
      0
    
    
      2
      1001678
      3017
      3329.0
      14
      13040
      0
      0
      0
      0
      1723
      23
    
    
      3
      445
      487
      46.0
      17
      1112
      0
      0
      0
      0
      477
      23
    
    
      4
      187
      68
      690359.0
      13
      0
      33
      13
      797
      42
      1495
      22
    
    
      5
      80
      87
      20167.0
      0
      0
      0
      9
      182
      36
      1061
      0
    
    
      6
      2020
      1978
      968182.0
      56
      0
      20
      8
      841
      0
      2105
      23
    
    
      7
      70
      80
      76735.0
      2
      0
      0
      11
      489
      18
      1680
      0
    
    
      8
      181
      144
      1960.0
      2
      0
      40
      15
      928
      0
      79
      23
    
    
      9
      20419393
      9
      468.0
      6
      0
      6
      0
      0
      0
      2736
      23
    
    
      10
      20423
      8
      17387.0
      130
      0
      0
      15
      122
      15
      408
      23
    
    
      11
      1436
      2017
      967.0
      55
      1099
      12
      0
      0
      0
      1404
      23
    
    
      12
      26376073
      91
      2666.0
      60767
      0
      13
      0
      0
      0
      2729
      23
    
    
      13
      27950120
      157
      287.0
      32993
      0
      6
      0
      0
      0
      2442
      23
    
    
      14
      5210
      2555
      5501.0
      232
      1169
      11
      0
      0
      0
      2513
      22
    
    
      15
      16328463
      1071
      14596.0
      10848
      0
      11
      0
      0
      0
      2904
      23
    
    
      16
      964
      331
      6389.0
      15
      2832
      0
      0
      0
      0
      2905
      23
    
    
      17
      5254
      2407
      3392.0
      102
      3082
      0
      0
      0
      0
      3043
      23
    
    
      18
      446
      431
      119680.0
      4
      0
      30
      12
      460
      0
      1201
      0
    
    
      19
      1404
      0
      141.0
      2
      12
      0
      9
      35
      7
      327
      23
    
    
      20
      16933838
      21
      3673.0
      14682
      0
      0
      0
      0
      0
      2945
      23
    
    
      21
      8619
      3229
      38918.0
      190
      50921
      12
      0
      0
      0
      2614
      23
    
    
      22
      15610695
      36
      82958.0
      15248
      0
      5
      0
      0
      0
      2981
      23
    
    
      23
      15836207
      1046
      27660.0
      32067
      0
      0
      0
      0
      0
      2901
      23
    
    
      24
      14749
      1843
      20076.0
      584
      2389
      16
      0
      0
      0
      2432
      0
    
    
      25
      191
      228
      24.0
      5
      164
      17
      0
      0
      0
      881
      23
    
    
      26
      25557131
      672
      7051.0
      51993
      0
      13
      0
      0
      0
      2935
      23
    
    
      27
      13206
      964
      14115.0
      496
      3426
      16
      0
      0
      0
      2544
      23
    
    
      28
      1641
      812
      3385.0
      91
      1493
      12
      0
      0
      0
      2750
      23
    
    
      29
      4545529
      2042
      9370.0
      12664
      13260
      3
      0
      0
      0
      2800
      23
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      545
      371
      4
      1892.0
      0
      3
      12
      12
      0
      0
      359
      23
    
    
      546
      263
      397
      62000.0
      1
      0
      38
      0
      474
      0
      1783
      0
    
    
      547
      50
      1
      4200.0
      0
      1
      0
      0
      68
      0
      509
      0
    
    
      548
      549
      21
      245021.0
      298
      6
      0
      0
      141
      0
      891
      0
    
    
      549
      4067
      360
      4061.0
      152
      3402
      12
      0
      0
      0
      2775
      0
    
    
      550
      486
      116
      306390.0
      12
      0
      56
      6
      290
      0
      520
      23
    
    
      551
      331
      375
      4313.0
      18
      49
      8
      0
      160
      0
      636
      22
    
    
      552
      16779295
      0
      3698.0
      9213
      0
      0
      0
      0
      0
      1812
      23
    
    
      553
      892
      355
      1028.0
      16
      500
      15
      0
      0
      0
      2039
      22
    
    
      554
      25
      4
      1323.0
      2
      0
      0
      0
      156
      0
      288
      0
    
    
      555
      993
      47
      18369.0
      1047
      13
      0
      12
      0
      13
      1286
      0
    
    
      556
      210
      200
      68691.0
      13
      1442
      0
      15
      104
      9
      1157
      23
    
    
      557
      231
      127
      87.0
      1
      64
      0
      0
      0
      0
      134
      23
    
    
      558
      980
      107
      14979.0
      59
      4
      17
      0
      74
      0
      2092
      22
    
    
      559
      19
      7
      26181.0
      1
      20
      0
      15
      127
      15
      795
      23
    
    
      560
      29145797
      800
      42988.0
      27906
      0
      9
      0
      0
      0
      3011
      23
    
    
      561
      297
      191
      296.0
      12
      32
      0
      0
      0
      0
      3070
      23
    
    
      562
      16296072
      23
      973.0
      20704
      0
      0
      0
      0
      0
      2242
      22
    
    
      563
      15
      0
      70833.0
      0
      0
      80
      12
      300
      60
      1059
      0
    
    
      564
      3065
      3517
      74560.0
      94
      0
      26
      14
      557
      0
      2469
      0
    
    
      565
      120
      71
      51891.0
      3
      0
      0
      13
      535
      0
      1687
      0
    
    
      566
      113
      226
      24115.0
      3
      0
      0
      10
      105
      106
      1060
      0
    
    
      567
      66
      84
      234290.0
      5
      0
      0
      9
      0
      23
      2112
      0
    
    
      568
      47
      16
      178704.0
      3
      0
      13
      8
      572
      0
      1835
      0
    
    
      569
      26841
      3359
      90417.0
      491
      130876
      6
      0
      0
      0
      2241
      23
    
    
      570
      1551
      138
      37318.0
      3119
      1
      15
      11
      159
      7
      305
      23
    
    
      571
      67697733
      998
      19364.0
      82023
      0
      13
      0
      0
      0
      3463
      23
    
    
      572
      1257
      363
      388672.0
      75
      0
      32
      15
      504
      0
      2358
      22
    
    
      573
      21585872
      2279
      3318.0
      128136
      0
      3
      0
      0
      0
      2463
      23
    
    
      574
      64
      80
      43438.0
      3
      0
      0
      15
      130
      17
      773
      23
    
  

575 rows × 11 columns



In [4]:

    
#IMPORTING THE XGBOOST CLASSIFIER TO PREDICT OUR TWITTER PROFILES
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# CONVERTING THE X_TRAIN AND Y_TRAIN ELEMENTS INTO FLOAT32
X_train = X_train.astype('float32')
X_test=X_test.astype('float32')

# Prepare the inputs for the model
xgb= xgb.XGBClassifier(learning_rate = 0.03,
 n_estimators= 300,
 max_depth= 3,
 min_child_weight= 2,
 subsample=0.4,
 colsample_bytree=0.2,
 objective= 'binary:logistic',
 nthread= 0,
 scale_pos_weight=1)
clf = xgb.fit(X_train,Y_train)

predictions = clf.predict(X_test)
predictions

test_data['bot'] = np.array(predictions)

result = pd.DataFrame(test_data_df['id'])

result  = result.astype(np.int64)
result['Bot'] = test_data['bot']
result
print (result['id'].dtype)
print (result['Bot'].dtype)
print (result)

#EXPORTING RESULTS
result.to_csv('output_xgb_0508_2.csv',  index=False)









    



/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)






    



int64
int64
                     id  Bot
0            2281292622    0
1            2344040251    0
2             765871267    0
3            4772373433    0
4            1324548560    1
5            2561341789    1
6             347810134    1
7             856303860    1
8    832875000000000000    1
9              88856792    0
10   713557000000000000    1
11           1566746503    0
12             90420314    0
13            184910040    0
14            157690631    0
15             42420346    0
16             42382447    0
17             43993280    0
18           2305236733    1
19   742794000000000000    1
20             31348594    0
21            122085859    0
22             23573083    0
23             43152482    0
24            188857501    0
25           2911272579    0
26             35094637    0
27            146252766    0
28             85430866    0
29             55117855    0
..                  ...  ...
545  731201000000000000    1
546           612754791    1
547          4493562022    1
548          2897136909    1
549            78956001    0
550          4462343293    1
551          3830053332    1
552           586671909    0
553           332888068    0
554  756937000000000000    1
555          2163813157    1
556          2602312513    1
557  813000000000000000    0
558           355883433    1
559          3229506502    1
560            20322929    0
561            16712547    0
562           268439864    0
563          2566951536    1
564           174632702    1
565           843270408    1
566          2564439320    1
567           342737458    1
568           566078011    1
569           268809577    0
570  750999000000000000    1
571            10228272    0
572           218833868    1
573           176566242    0
574          3119554528    1

[575 rows x 2 columns]






    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [5]:

    
# ##### Importing the RandomForestClassifier and predicting whether the profile is a bot or not using it
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import roc_auc_score

# rfc= RandomForestClassifier(n_estimators=200,oob_score= True,n_jobs=-1,random_state=42,max_features="auto"
#                             ,min_samples_leaf=1,max_depth=15)
# rfc.fit(X_train,Y_train)
# rfc_predictions=rfc.predict(X_test)
# rfc_predictions

# test_data['bot'] = np.array(rfc_predictions)

# result = pd.DataFrame(test_data_df['id'])

# result  = result.astype(np.int64)
# result['bot'] = test_data['bot']
# result
# print (result['id'].dtype)
# print (result['bot'].dtype)
# print (result)

# result.to_csv('output_rfc_new.csv',  index=False)

# # # # #Printing the Confusion Matrix and Classification Report for RandomForestClassifier
# # # # #print
# # # # #print(confusion_matrix(Y_test,rfc_predictions))
# # # # #print('\n')
# # # # #print(classification_report(Y_test,rfc_predictions))
# # # # #print("accuracy score: {} ".format(accuracy_score(Y_test, rfc_predictions)))  
# # # # #print("AUC: {}".format(roc_auc_score(Y_test,rfc_predictions)))



In [6]:

    
from sklearn.cross_validation import cross_val_score
accuracyScore = cross_val_score(xgb,X_train, Y_train, cv=5, scoring='accuracy')
print("Accuracy = ",accuracyScore.mean())









    



('Accuracy = ', 0.90882560212245578)



In [7]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

X_train,X_test,Y_train,Y_test=train_test_split(X_train,Y_train,test_size=0.15,random_state=50)
rfc= RandomForestClassifier(n_estimators=200,oob_score= True,n_jobs=-1,random_state=50,max_features="auto"
                            ,min_samples_leaf=1,max_depth=15)
rfc.fit(X_train,Y_train)
rfc_predictions=rfc.predict(X_test)
rfc_predictions
#Printing the Confusion Matrix and Classification Report for RandomForestClassifier
#print
print(confusion_matrix(Y_test,rfc_predictions))
print('\n')
print(classification_report(Y_test,rfc_predictions))
print("accuracy score: {} ".format(accuracy_score(Y_test, rfc_predictions)))  
print("AUC: {}".format(roc_auc_score(Y_test,rfc_predictions)))









    



[[210  13]
 [ 21 176]]


             precision    recall  f1-score   support

          0       0.91      0.94      0.93       223
          1       0.93      0.89      0.91       197

avg / total       0.92      0.92      0.92       420

accuracy score: 0.919047619048 
AUC: 0.917552525551



In [ ]:

	id	id_str	screen_name	location	description	url	followers_count	friends_count	listedcount	created_at	favourites_count	verified	statuses_count	lang	status	default_profile	default_profile_image	has_extended_profile	name	bot
0	8.160000e+17	"815745789754417152"	"HoustonPokeMap"	"Houston, TX"	"Rare and strong PokŽmon in Houston, TX. See m...	"https://t.co/dnWuDbFRkt"	1291	0	10	"Mon Jan 02 02:25:26 +0000 2017"	0	False	78554	"en"	{\r "created_at": "Sun Mar 12 15:44:04 +0...	True	False	False	"Houston PokŽ Alert"	1
1	4.843621e+09	4843621225	kernyeahx	Templeville town, MD, USA	From late 2014 Socium Marketplace will make sh...	NaN	1	349	0	2/1/2016 7:37	38	False	31	en	null	True	False	False	Keri Nelson	1
2	4.303727e+09	4303727112	mattlieberisbot	NaN	Inspired by the smart, funny folks at @replyal...	https://t.co/P1e1o0m4KC	1086	0	14	Fri Nov 20 18:53:22 +0000 2015	0	False	713	en	{'retweeted': False, 'is_quote_status': False,...	True	False	False	Matt Lieber Is Bot	1
3	3.063139e+09	3063139353	sc_papers	NaN	NaN	NaN	33	0	8	2/25/2015 20:11	0	False	676	en	Construction of human anti-tetanus single-chai...	True	True	False	single cell papers	1
4	2.955142e+09	2955142070	lucarivera16	Dublin, United States	Inspiring cooks everywhere since 1956.	NaN	11	745	0	1/1/2015 17:44	146	False	185	en	null	False	False	False	lucarivera16	1
5	8.410000e+17	8.41E+17	dantheimprover	Austin, TX	Just a guy trying to do good by telling everyo...	NaN	1	186	0	13/03/2017 22:53	0	False	11	en	Status(_api=<tweepy.api.API object at 0x101927...	True	False	True	dantheimprover	1
6	2.482835e+09	2482834658	_all_of_us_	in a machine.	bot by @rubicon	NaN	193	0	19	Wed May 07 22:29:25 +0000 2014	0	False	6068	en	{u'contributors': None, u'truncated': False, u...	False	False	False	everything always	1
7	3.333574e+09	3333573622	KatamariItems	NaN	[Bot rolled up by @BeachEpisode] Cataloguing e...	NaN	8227	2	89	Thu Jun 18 22:07:31 +0000 2015	26	False	2597	en	{u'contributors': None, u'truncated': False, u...	True	False	False	Katamari Collection	1
8	2.996105e+09	2996105102	AutophagyPapers	NaN	Twitterbot for #Autophagy papers. Curated by @...	NaN	275	0	17	1/25/2015 17:34	23	False	9922	en	Feeding Schedule And Proteolysis Regulate Auto...	False	False	False	Autophagy Papers	1
9	3.271096e+09	3271095818	HSC_papers	NaN	NaN	NaN	51	3	9	7/7/2015 15:23	0	False	2515	en	Functional Selectivity in Cytokine Signaling R...	True	False	False	Hematopoiesis	1

	followers_count	friends_count	statuses_count	list_count	fav_count	is_bot_location	is_bot_screenname	is_bot_description	is_bot_name	age_of_profile	is_url
0	4466	1295	3036.0	111	1579	8	0	0	0	1216	23
1	295	1016	618.0	10	300	15	0	0	0	1178	0
2	1001678	3017	3329.0	14	13040	0	0	0	0	1723	23
3	445	487	46.0	17	1112	0	0	0	0	477	23
4	187	68	690359.0	13	0	33	13	797	42	1495	22
5	80	87	20167.0	0	0	0	9	182	36	1061	0
6	2020	1978	968182.0	56	0	20	8	841	0	2105	23
7	70	80	76735.0	2	0	0	11	489	18	1680	0
8	181	144	1960.0	2	0	40	15	928	0	79	23
9	20419393	9	468.0	6	0	6	0	0	0	2736	23
10	20423	8	17387.0	130	0	0	15	122	15	408	23
11	1436	2017	967.0	55	1099	12	0	0	0	1404	23
12	26376073	91	2666.0	60767	0	13	0	0	0	2729	23
13	27950120	157	287.0	32993	0	6	0	0	0	2442	23
14	5210	2555	5501.0	232	1169	11	0	0	0	2513	22
15	16328463	1071	14596.0	10848	0	11	0	0	0	2904	23
16	964	331	6389.0	15	2832	0	0	0	0	2905	23
17	5254	2407	3392.0	102	3082	0	0	0	0	3043	23
18	446	431	119680.0	4	0	30	12	460	0	1201	0
19	1404	0	141.0	2	12	0	9	35	7	327	23
20	16933838	21	3673.0	14682	0	0	0	0	0	2945	23
21	8619	3229	38918.0	190	50921	12	0	0	0	2614	23
22	15610695	36	82958.0	15248	0	5	0	0	0	2981	23
23	15836207	1046	27660.0	32067	0	0	0	0	0	2901	23
24	14749	1843	20076.0	584	2389	16	0	0	0	2432	0
25	191	228	24.0	5	164	17	0	0	0	881	23
26	25557131	672	7051.0	51993	0	13	0	0	0	2935	23
27	13206	964	14115.0	496	3426	16	0	0	0	2544	23
28	1641	812	3385.0	91	1493	12	0	0	0	2750	23
29	4545529	2042	9370.0	12664	13260	3	0	0	0	2800	23
...	...	...	...	...	...	...	...	...	...	...	...
545	371	4	1892.0	0	3	12	12	0	0	359	23
546	263	397	62000.0	1	0	38	0	474	0	1783	0
547	50	1	4200.0	0	1	0	0	68	0	509	0
548	549	21	245021.0	298	6	0	0	141	0	891	0
549	4067	360	4061.0	152	3402	12	0	0	0	2775	0
550	486	116	306390.0	12	0	56	6	290	0	520	23
551	331	375	4313.0	18	49	8	0	160	0	636	22
552	16779295	0	3698.0	9213	0	0	0	0	0	1812	23
553	892	355	1028.0	16	500	15	0	0	0	2039	22
554	25	4	1323.0	2	0	0	0	156	0	288	0
555	993	47	18369.0	1047	13	0	12	0	13	1286	0
556	210	200	68691.0	13	1442	0	15	104	9	1157	23
557	231	127	87.0	1	64	0	0	0	0	134	23
558	980	107	14979.0	59	4	17	0	74	0	2092	22
559	19	7	26181.0	1	20	0	15	127	15	795	23
560	29145797	800	42988.0	27906	0	9	0	0	0	3011	23
561	297	191	296.0	12	32	0	0	0	0	3070	23
562	16296072	23	973.0	20704	0	0	0	0	0	2242	22
563	15	0	70833.0	0	0	80	12	300	60	1059	0
564	3065	3517	74560.0	94	0	26	14	557	0	2469	0
565	120	71	51891.0	3	0	0	13	535	0	1687	0
566	113	226	24115.0	3	0	0	10	105	106	1060	0
567	66	84	234290.0	5	0	0	9	0	23	2112	0
568	47	16	178704.0	3	0	13	8	572	0	1835	0
569	26841	3359	90417.0	491	130876	6	0	0	0	2241	23
570	1551	138	37318.0	3119	1	15	11	159	7	305	23
571	67697733	998	19364.0	82023	0	13	0	0	0	3463	23
572	1257	363	388672.0	75	0	32	15	504	0	2358	22
573	21585872	2279	3318.0	128136	0	3	0	0	0	2463	23
574	64	80	43438.0	3	0	0	15	130	17	773	23