In [1]:
#importing pandas package
import pandas as pd
import numpy as np

#Input path where the data of the project resides
path_to_projectData = 'Data/' 

#Reading the TRAINING DATA CSV file
train_data_df = pd.read_csv(path_to_projectData + 'training_data_2_csv_UTF.csv') 

#SELECTING TOP 10 ROWS FROM THE TRAINING DATASET
train_data_df.head(10)


Out[1]:
id id_str screen_name location description url followers_count friends_count listedcount created_at favourites_count verified statuses_count lang status default_profile default_profile_image has_extended_profile name bot
0 8.160000e+17 "815745789754417152" "HoustonPokeMap" "Houston, TX" "Rare and strong PokŽmon in Houston, TX. See m... "https://t.co/dnWuDbFRkt" 1291 0 10 "Mon Jan 02 02:25:26 +0000 2017" 0 False 78554 "en" {\r "created_at": "Sun Mar 12 15:44:04 +0... True False False "Houston PokŽ Alert" 1
1 4.843621e+09 4843621225 kernyeahx Templeville town, MD, USA From late 2014 Socium Marketplace will make sh... NaN 1 349 0 2/1/2016 7:37 38 False 31 en null True False False Keri Nelson 1
2 4.303727e+09 4303727112 mattlieberisbot NaN Inspired by the smart, funny folks at @replyal... https://t.co/P1e1o0m4KC 1086 0 14 Fri Nov 20 18:53:22 +0000 2015 0 False 713 en {'retweeted': False, 'is_quote_status': False,... True False False Matt Lieber Is Bot 1
3 3.063139e+09 3063139353 sc_papers NaN NaN NaN 33 0 8 2/25/2015 20:11 0 False 676 en Construction of human anti-tetanus single-chai... True True False single cell papers 1
4 2.955142e+09 2955142070 lucarivera16 Dublin, United States Inspiring cooks everywhere since 1956. NaN 11 745 0 1/1/2015 17:44 146 False 185 en null False False False lucarivera16 1
5 8.410000e+17 8.41E+17 dantheimprover Austin, TX Just a guy trying to do good by telling everyo... NaN 1 186 0 13/03/2017 22:53 0 False 11 en Status(_api=<tweepy.api.API object at 0x101927... True False True dantheimprover 1
6 2.482835e+09 2482834658 _all_of_us_ in a machine. bot by @rubicon NaN 193 0 19 Wed May 07 22:29:25 +0000 2014 0 False 6068 en {u'contributors': None, u'truncated': False, u... False False False everything always 1
7 3.333574e+09 3333573622 KatamariItems NaN [Bot rolled up by @BeachEpisode] Cataloguing e... NaN 8227 2 89 Thu Jun 18 22:07:31 +0000 2015 26 False 2597 en {u'contributors': None, u'truncated': False, u... True False False Katamari Collection 1
8 2.996105e+09 2996105102 AutophagyPapers NaN Twitterbot for #Autophagy papers. Curated by @... NaN 275 0 17 1/25/2015 17:34 23 False 9922 en Feeding Schedule And Proteolysis Regulate Auto... False False False Autophagy Papers 1
9 3.271096e+09 3271095818 HSC_papers NaN NaN NaN 51 3 9 7/7/2015 15:23 0 False 2515 en Functional Selectivity in Cytokine Signaling R... True False False Hematopoiesis 1

In [2]:
# selecting attributes which have textual, numeric values and categorical values
train_data= train_data_df[['screen_name','location','description','url',
       'followers_count', 'friends_count', 'listedcount','created_at',
       'favourites_count','statuses_count','name']]

#Looking for NaN and NULL values in the following features of the dataframe, and using lambda as defined to replace them with 0
train_data['followers_count']=train_data['followers_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['friends_count']=train_data['friends_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['list_count']=train_data['listedcount'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
train_data['fav_count']=train_data['favourites_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)

#Dropping columns that have been renamed (the XGBOOST classifier which will be used later found discrepancy in the training and test data feature names, which made this step required)
train_data.drop('listedcount', axis=1, inplace=True)
train_data.drop('favourites_count', axis=1, inplace=True)

#y = lambda symbol: 'X' if symbol==True else 'O' if symbol==False else ' '



#Looking for NaN and NULL values in the location  feature of the dataframe, replacing the respective with "" empty strings
train_data['location']=train_data['location'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)

#Computing a new column  is_bot_location to replace the textual feature - screenname
train_data['is_bot_location'] = 0
train_data['is_bot_location'] = train_data['location'].apply(lambda x:0 if x=="" else len(x))

#Dropping the location feature in the dataframe as we have replaced it with a numerical version is_bot_location
train_data.drop('location', axis=1, inplace=True)



#Computing a new column 'is_bot_screenname' to replace the textual feature - screenname
train_data['is_bot_screenname']=0
screenname = train_data['screen_name']
flag = []
#Looking for the word bot/BOT/Bot in screenname and appending the length of the screenname, if we have a success, else 0
for x in screenname:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
train_data['is_bot_screenname']=flag
#Dropping the screen_name feature in the dataframe as we have replaced it with a numerical version is_bot_screenname
train_data.drop('screen_name', axis=1, inplace=True)



#Computing a new column 'is_bot_description' to replace the textual feature - description
train_data['is_bot_description']=0
description = train_data['description']
flag = []
#Looking for the word bot/BOT/Bot in 'description' and appending the length of the description, if we have a success, else 0
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
train_data['is_bot_description']=flag
#Dropping the 'description' feature in the dataframe as we have replaced it with a numerical version is_bot_description
train_data.drop('description', axis=1, inplace=True)



#Computing a new column 'is_bot_name' to replace the textual feature - name
train_data['is_bot_name']=0
description = train_data['name']
flag = []
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
train_data['is_bot_name']=flag
#Dropping the 'name' feature in the dataframe as we have replaced it with a numerical version is_bot_name
train_data.drop('name', axis=1, inplace=True)


#Computing the number of days since the profile has been created
timestamp = train_data_df['created_at']
train_data['age_of_profile']=0
profile_age=[]
from datetime import datetime
#Preparing a list of date formats which are currently present in the 'created_at' feature of our dataframe
DATE_FORMATS = ['%a %b %d %H:%M:%S +0000 %Y','"%a %b %d %H:%M:%S +0000 %Y"','%Y-%m-%d %H:%M:%S','%d/%m/%Y %H:%M' , '%m/%d/%Y %H:%M']

#Looping through each row, and each date 
#CHECK IF THE FORMAT IS IN ACCORDANCE TO THE LIST DECLARED ABOVE
#Compute the current date and find the difference with the computed 'created_at'
for i in timestamp:
    try:
        for dateformat in DATE_FORMATS:
            try:
                my_date = datetime.strptime(i,dateformat)
                break
            except ValueError:
                pass
        today = datetime.today()
        difference = today - my_date
        profile_age.append(difference.days)
    except ValueError:
        pass

train_data['age_of_profile']=profile_age
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
train_data.drop('created_at', axis=1, inplace=True)


#Computing the URL length
train_data['is_url']=0
train_data['url'].fillna(0,inplace=True)

description = train_data['url']
flag = []
for x in description:
    x = str(x)
    if x == 'None' or x=='0':
        flag.append(0)
    else:
        flag.append(len(x))
train_data['is_url']=flag
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
train_data.drop('url', axis=1, inplace=True)


#PREPARING THE X_TRAIN DATA WITH THE NEW COMPUTED TRAIN_DATA VERSION
X_train=train_data

#PREPARING Y_TRAIN WITH ONLY THE BOT COLUMN DATA FROM THE DATAFRAME
Y_train=train_data_df['bot']

#DISPLAYING ALL THE ROWS OF X_TRAIN
X_train


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:46: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:51: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:61: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:63: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:68: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:77: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:79: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:84: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:107: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:109: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:113: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:3295: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:124: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:126: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[2]:
followers_count friends_count statuses_count list_count fav_count is_bot_location is_bot_screenname is_bot_description is_bot_name age_of_profile is_url
0 1291 0 78554 10 0 13 0 0 0 126 25
1 1 349 31 0 38 25 0 0 0 492 0
2 1086 0 713 14 0 0 15 128 18 534 23
3 33 0 676 8 0 0 0 0 0 802 0
4 11 745 185 0 146 21 0 0 0 857 0
5 1 186 11 0 0 10 0 0 0 55 0
6 193 0 6068 19 0 13 0 15 0 1096 0
7 8227 2 2597 89 26 0 0 148 0 689 0
8 275 0 9922 17 23 0 0 55 0 833 0
9 51 3 2515 9 0 0 0 0 0 670 0
10 51 1 111 12 0 11 0 0 0 1003 23
11 2 1 230 4 0 0 0 107 0 959 0
12 0 29 0 0 0 0 0 0 0 57 0
13 1 206 0 0 0 0 0 0 0 1598 0
14 0 38 0 0 0 0 0 0 0 57 0
15 109 0 16067 16 0 0 0 0 0 808 0
16 250 0 31721 25 0 0 0 0 0 1058 0
17 15 1941 406 1 319 14 0 0 0 858 0
18 190 1899 0 0 27 28 0 0 0 -57 0
19 181 0 21506 24 0 0 0 86 0 951 23
20 10175 11465 65022 199 328 0 0 0 0 1316 0
21 23 4 6230 8 4 7 0 0 0 1100 0
22 106126 0 30156 999 2 19 13 0 0 1319 23
23 2302 3 7640 161 4 5 12 136 12 558 23
24 443 2 9584 17 6 14 0 81 0 578 23
25 27 1 145 10 0 0 0 0 0 1064 0
26 191 2 5684 19 0 8 0 0 0 1079 0
27 1 35 82 1 54 0 0 0 0 90 0
28 651 0 19867 36 0 0 0 130 0 1212 0
29 37 45 191 7 0 29 0 0 0 978 0
... ... ... ... ... ... ... ... ... ... ... ...
2767 1017758 6985 13786 9473 12901 15 0 0 0 3161 22
2768 9 97 1 0 0 10 0 0 0 1172 0
2769 2000567 457 55202 9674 901 27 0 0 0 2881 23
2770 11 53 13 0 1 2 0 0 0 2388 4
2771 1599 922 11240 67 212 24 0 0 0 3017 22
2772 82 91 704 0 634 0 0 0 0 1419 0
2773 178 325 405 6 30 16 0 0 0 1708 22
2774 46 233 285 0 935 0 0 0 0 2061 0
2775 67 168 1851 0 0 6 0 0 0 2450 0
2776 147441 20889 7648 0 43 15 0 0 0 3097 0
2777 420699 108 5568 5248 11 30 0 0 0 2977 22
2778 63866 21 70 2037 40 0 0 0 0 2790 23
2779 1715 1342 1395 84 3688 17 0 0 0 2860 23
2780 5 67 2 0 0 21 0 0 0 2279 0
2781 122 88 1031 0 325 13 0 0 0 1469 0
2782 4025 487 6672 117 18 0 0 0 0 2838 22
2783 52 39 206 0 689 6 0 0 0 324 23
2784 30 72 926 4 1 27 0 0 0 1744 24
2785 41776 1158 1096 854 1264 9 0 0 0 2993 0
2786 52 323 767 2 696 12 0 0 0 2563 0
2787 972032 3072 22362 1587 1423 10 0 0 0 2829 0
2788 26 150 2 1 8 0 0 0 0 2672 0
2789 215 462 2384 16 812 12 0 0 0 3258 23
2790 64219 667 85 698 2901 14 0 0 0 2854 16
2791 310 0 3909 21 3 0 11 0 0 1243 22
2792 18998 2005 3498 425 2503 0 0 0 0 2046 0
2793 32 54 97 0 1 11 0 0 0 2270 22
2794 45044433 7451 9606 68157 24 18 0 0 0 3215 22
2795 16 64 62 1 15 23 0 0 0 1641 0
2796 22490 308 1897 1342 43 9 0 0 0 3016 22

2797 rows × 11 columns


In [3]:
#Reading the TEST data CSV file
test_data_df= pd.read_csv(path_to_projectData + 'test_data_4_students.csv')

test_data_df=test_data_df.head(575)

# Selecting Attributes which have textual, numeric values and categorical values
test_data= test_data_df[['screen_name','location','description','url',
       'followers_count', 'friends_count', 'listed_count','created_at',
       'favorites_count','statuses_count','name']]

#Looking for NaN and NULL values in the following feature of the dataframe, and using lambda as defined to replace them with 0
#df.Age.apply(lambda x: x if not pd.isnull(x) else 'Is Null value')
test_data['followers_count']=test_data['followers_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['friends_count']=test_data['friends_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['list_count']=test_data['listed_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
test_data['fav_count']=test_data['favorites_count'].apply(lambda x: 0 if x=='None' else 0 if x=='NaN' else x)
#Dropping columns that have been renamed (the XGBOOST classifier which will be used later found discrepancy in the training and test data feature names, which made this step required)
test_data.drop('listed_count', axis=1, inplace=True)
test_data.drop('favorites_count', axis=1, inplace=True)
#y = lambda symbol: 'X' if symbol==True else 'O' if symbol==False else ' '
test_data['followers_count'].fillna(0,inplace=True)
test_data['friends_count'].fillna(0,inplace=True)
test_data['statuses_count'].fillna(0,inplace=True)

#Looking for NaN and NULL values in the location  feature of the dataframe, replacing the respective with "" empty strings
test_data['location']=test_data['location'].apply(lambda x: "" if type(x)==float else "" if x=='NaN'
                                                          else "" if x=='None' else x)

#Computing a new column  is_bot_location to replace the textual feature - screenname
test_data['is_bot_location'] = 0
test_data['is_bot_location'] = test_data['location'].apply(lambda x:0 if x=="" else len(x))

#Dropping the location feature in the dataframe as we have replaced it with a numerical version is_bot_location
test_data.drop('location', axis=1, inplace=True)



#Computing a new column 'is_bot_screenname' to replace the textual feature - screenname
test_data['is_bot_screenname']=0
screenname = test_data['screen_name']
flag = []
#Looking for the word bot/BOT/Bot in screenname and appending the length of the screenname, if we have a success, else 0
for x in screenname:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
test_data['is_bot_screenname']=flag
#Dropping the screen_name feature in the dataframe as we have replaced it with a numerical version is_bot_screenname
test_data.drop('screen_name', axis=1, inplace=True)



#Computing a new column 'is_bot_description' to replace the textual feature - description
test_data['is_bot_description']=0
description = test_data['description']
flag = []
#Looking for the word bot/BOT/Bot in 'description' and appending the length of the description, if we have a success, else 0
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
test_data['is_bot_description']=flag
#Dropping the 'description' feature in the dataframe as we have replaced it with a numerical version is_bot_description
test_data.drop('description', axis=1, inplace=True)



#Computing a new column 'is_bot_name' to replace the textual feature - name
test_data['is_bot_name']=0
description = test_data['name']
flag = []
for x in description:
    x = str(x)
    if 'bot' in x or 'BOT' in x or 'Bot' in x:
        flag.append(len(x))
    else:
        flag.append(0)
test_data['is_bot_name']=flag
#Dropping the 'name' feature in the dataframe as we have replaced it with a numerical version is_bot_name
test_data.drop('name', axis=1, inplace=True)


#Computing the number of days since the profile has been created
timestamp = test_data['created_at']
test_data['age_of_profile']=0
profile_age=[]
from datetime import datetime

#Preparing a list of date formats which are currently present in the 'created_at' feature of our dataframe
DATE_FORMATS = ['%a %b %d %H:%M:%S +0000 %Y','"%a %b %d %H:%M:%S +0000 %Y"','%Y-%m-%d %H:%M:%S','%d/%m/%Y %H:%M' , '%m/%d/%Y %H:%M']

#Looping through each row, and each date 
#CHECK IF THE FORMAT IS IN ACCORDANCE TO THE LIST DECLARED ABOVE
#Compute the current date and find the difference with the computed 'created_at'
#computing the number of days since profile creation

for i in timestamp:
    try:
        for dateformat in DATE_FORMATS:
            try:
                my_date = datetime.strptime(i,dateformat)
                break
            except ValueError:
                pass
        today = datetime.today()
        difference = today - my_date
        profile_age.append(difference.days)
    except ValueError:
        pass

test_data['age_of_profile']=profile_age
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
test_data.drop('created_at', axis=1, inplace=True)


#Computing the URL length
test_data['is_url']=0
test_data['url'].fillna(0,inplace=True)

description = test_data['url']
flag = []
for x in description:
    x = str(x)
    if x == 'None' or x=='0':
        flag.append(0)
    else:
        flag.append(len(x))
test_data['is_url']=flag
#Dropping the feature 'created_at' in the dataframe as we have replaced it with a numerical version age_of_profile
test_data.drop('url', axis=1, inplace=True)

#PREPARING THE X_TEST DATA WITH THE NEW COMPUTED TRAIN_DATA VERSION
X_test=test_data

#DISPLAYING ALL THE ROWS OF X_TEST
X_test


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:39: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:49: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:51: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:56: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:66: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:68: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:73: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:82: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:84: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:89: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:115: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:121: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:134: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[3]:
followers_count friends_count statuses_count list_count fav_count is_bot_location is_bot_screenname is_bot_description is_bot_name age_of_profile is_url
0 4466 1295 3036.0 111 1579 8 0 0 0 1216 23
1 295 1016 618.0 10 300 15 0 0 0 1178 0
2 1001678 3017 3329.0 14 13040 0 0 0 0 1723 23
3 445 487 46.0 17 1112 0 0 0 0 477 23
4 187 68 690359.0 13 0 33 13 797 42 1495 22
5 80 87 20167.0 0 0 0 9 182 36 1061 0
6 2020 1978 968182.0 56 0 20 8 841 0 2105 23
7 70 80 76735.0 2 0 0 11 489 18 1680 0
8 181 144 1960.0 2 0 40 15 928 0 79 23
9 20419393 9 468.0 6 0 6 0 0 0 2736 23
10 20423 8 17387.0 130 0 0 15 122 15 408 23
11 1436 2017 967.0 55 1099 12 0 0 0 1404 23
12 26376073 91 2666.0 60767 0 13 0 0 0 2729 23
13 27950120 157 287.0 32993 0 6 0 0 0 2442 23
14 5210 2555 5501.0 232 1169 11 0 0 0 2513 22
15 16328463 1071 14596.0 10848 0 11 0 0 0 2904 23
16 964 331 6389.0 15 2832 0 0 0 0 2905 23
17 5254 2407 3392.0 102 3082 0 0 0 0 3043 23
18 446 431 119680.0 4 0 30 12 460 0 1201 0
19 1404 0 141.0 2 12 0 9 35 7 327 23
20 16933838 21 3673.0 14682 0 0 0 0 0 2945 23
21 8619 3229 38918.0 190 50921 12 0 0 0 2614 23
22 15610695 36 82958.0 15248 0 5 0 0 0 2981 23
23 15836207 1046 27660.0 32067 0 0 0 0 0 2901 23
24 14749 1843 20076.0 584 2389 16 0 0 0 2432 0
25 191 228 24.0 5 164 17 0 0 0 881 23
26 25557131 672 7051.0 51993 0 13 0 0 0 2935 23
27 13206 964 14115.0 496 3426 16 0 0 0 2544 23
28 1641 812 3385.0 91 1493 12 0 0 0 2750 23
29 4545529 2042 9370.0 12664 13260 3 0 0 0 2800 23
... ... ... ... ... ... ... ... ... ... ... ...
545 371 4 1892.0 0 3 12 12 0 0 359 23
546 263 397 62000.0 1 0 38 0 474 0 1783 0
547 50 1 4200.0 0 1 0 0 68 0 509 0
548 549 21 245021.0 298 6 0 0 141 0 891 0
549 4067 360 4061.0 152 3402 12 0 0 0 2775 0
550 486 116 306390.0 12 0 56 6 290 0 520 23
551 331 375 4313.0 18 49 8 0 160 0 636 22
552 16779295 0 3698.0 9213 0 0 0 0 0 1812 23
553 892 355 1028.0 16 500 15 0 0 0 2039 22
554 25 4 1323.0 2 0 0 0 156 0 288 0
555 993 47 18369.0 1047 13 0 12 0 13 1286 0
556 210 200 68691.0 13 1442 0 15 104 9 1157 23
557 231 127 87.0 1 64 0 0 0 0 134 23
558 980 107 14979.0 59 4 17 0 74 0 2092 22
559 19 7 26181.0 1 20 0 15 127 15 795 23
560 29145797 800 42988.0 27906 0 9 0 0 0 3011 23
561 297 191 296.0 12 32 0 0 0 0 3070 23
562 16296072 23 973.0 20704 0 0 0 0 0 2242 22
563 15 0 70833.0 0 0 80 12 300 60 1059 0
564 3065 3517 74560.0 94 0 26 14 557 0 2469 0
565 120 71 51891.0 3 0 0 13 535 0 1687 0
566 113 226 24115.0 3 0 0 10 105 106 1060 0
567 66 84 234290.0 5 0 0 9 0 23 2112 0
568 47 16 178704.0 3 0 13 8 572 0 1835 0
569 26841 3359 90417.0 491 130876 6 0 0 0 2241 23
570 1551 138 37318.0 3119 1 15 11 159 7 305 23
571 67697733 998 19364.0 82023 0 13 0 0 0 3463 23
572 1257 363 388672.0 75 0 32 15 504 0 2358 22
573 21585872 2279 3318.0 128136 0 3 0 0 0 2463 23
574 64 80 43438.0 3 0 0 15 130 17 773 23

575 rows × 11 columns


In [4]:
#IMPORTING THE XGBOOST CLASSIFIER TO PREDICT OUR TWITTER PROFILES
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# CONVERTING THE X_TRAIN AND Y_TRAIN ELEMENTS INTO FLOAT32
X_train = X_train.astype('float32')
X_test=X_test.astype('float32')

# Prepare the inputs for the model
xgb= xgb.XGBClassifier(learning_rate = 0.03,
 n_estimators= 300,
 max_depth= 3,
 min_child_weight= 2,
 subsample=0.4,
 colsample_bytree=0.2,
 objective= 'binary:logistic',
 nthread= 0,
 scale_pos_weight=1)
clf = xgb.fit(X_train,Y_train)

predictions = clf.predict(X_test)
predictions

test_data['bot'] = np.array(predictions)

result = pd.DataFrame(test_data_df['id'])

result  = result.astype(np.int64)
result['Bot'] = test_data['bot']
result
print (result['id'].dtype)
print (result['Bot'].dtype)
print (result)

#EXPORTING RESULTS
result.to_csv('output_xgb_0508_2.csv',  index=False)


/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
int64
int64
                     id  Bot
0            2281292622    0
1            2344040251    0
2             765871267    0
3            4772373433    0
4            1324548560    1
5            2561341789    1
6             347810134    1
7             856303860    1
8    832875000000000000    1
9              88856792    0
10   713557000000000000    1
11           1566746503    0
12             90420314    0
13            184910040    0
14            157690631    0
15             42420346    0
16             42382447    0
17             43993280    0
18           2305236733    1
19   742794000000000000    1
20             31348594    0
21            122085859    0
22             23573083    0
23             43152482    0
24            188857501    0
25           2911272579    0
26             35094637    0
27            146252766    0
28             85430866    0
29             55117855    0
..                  ...  ...
545  731201000000000000    1
546           612754791    1
547          4493562022    1
548          2897136909    1
549            78956001    0
550          4462343293    1
551          3830053332    1
552           586671909    0
553           332888068    0
554  756937000000000000    1
555          2163813157    1
556          2602312513    1
557  813000000000000000    0
558           355883433    1
559          3229506502    1
560            20322929    0
561            16712547    0
562           268439864    0
563          2566951536    1
564           174632702    1
565           843270408    1
566          2564439320    1
567           342737458    1
568           566078011    1
569           268809577    0
570  750999000000000000    1
571            10228272    0
572           218833868    1
573           176566242    0
574          3119554528    1

[575 rows x 2 columns]
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [5]:
# ##### Importing the RandomForestClassifier and predicting whether the profile is a bot or not using it
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import roc_auc_score

# rfc= RandomForestClassifier(n_estimators=200,oob_score= True,n_jobs=-1,random_state=42,max_features="auto"
#                             ,min_samples_leaf=1,max_depth=15)
# rfc.fit(X_train,Y_train)
# rfc_predictions=rfc.predict(X_test)
# rfc_predictions

# test_data['bot'] = np.array(rfc_predictions)

# result = pd.DataFrame(test_data_df['id'])

# result  = result.astype(np.int64)
# result['bot'] = test_data['bot']
# result
# print (result['id'].dtype)
# print (result['bot'].dtype)
# print (result)

# result.to_csv('output_rfc_new.csv',  index=False)

# # # # #Printing the Confusion Matrix and Classification Report for RandomForestClassifier
# # # # #print
# # # # #print(confusion_matrix(Y_test,rfc_predictions))
# # # # #print('\n')
# # # # #print(classification_report(Y_test,rfc_predictions))
# # # # #print("accuracy score: {} ".format(accuracy_score(Y_test, rfc_predictions)))  
# # # # #print("AUC: {}".format(roc_auc_score(Y_test,rfc_predictions)))

In [6]:
from sklearn.cross_validation import cross_val_score
accuracyScore = cross_val_score(xgb,X_train, Y_train, cv=5, scoring='accuracy')
print("Accuracy = ",accuracyScore.mean())


('Accuracy = ', 0.90882560212245578)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

X_train,X_test,Y_train,Y_test=train_test_split(X_train,Y_train,test_size=0.15,random_state=50)
rfc= RandomForestClassifier(n_estimators=200,oob_score= True,n_jobs=-1,random_state=50,max_features="auto"
                            ,min_samples_leaf=1,max_depth=15)
rfc.fit(X_train,Y_train)
rfc_predictions=rfc.predict(X_test)
rfc_predictions
#Printing the Confusion Matrix and Classification Report for RandomForestClassifier
#print
print(confusion_matrix(Y_test,rfc_predictions))
print('\n')
print(classification_report(Y_test,rfc_predictions))
print("accuracy score: {} ".format(accuracy_score(Y_test, rfc_predictions)))  
print("AUC: {}".format(roc_auc_score(Y_test,rfc_predictions)))


[[210  13]
 [ 21 176]]


             precision    recall  f1-score   support

          0       0.91      0.94      0.93       223
          1       0.93      0.89      0.91       197

avg / total       0.92      0.92      0.92       420

accuracy score: 0.919047619048 
AUC: 0.917552525551

In [ ]: