In [1]:
! sudo pip install pandas
! sudo pip install matplotlib
! sudo apt-get -y install python3-tk


Requirement already satisfied (use --upgrade to upgrade): pandas in /usr/local/lib/python3.4/dist-packages
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in /usr/lib/python3/dist-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in /usr/local/lib/python3.4/dist-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in /root/.local/lib/python3.4/site-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): six>=1.5 in /usr/local/lib/python3.4/dist-packages (from python-dateutil>=2->pandas)
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
Requirement already satisfied (use --upgrade to upgrade): matplotlib in /usr/local/lib/python3.4/dist-packages
Requirement already satisfied (use --upgrade to upgrade): pyparsing!=2.0.0,!=2.0.4,!=2.1.2,>=1.5.6 in /usr/local/lib/python3.4/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.6 in /usr/lib/python3/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil in /usr/local/lib/python3.4/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): cycler in /usr/local/lib/python3.4/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): pytz in /root/.local/lib/python3.4/site-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): six>=1.5 in /usr/local/lib/python3.4/dist-packages (from python-dateutil->matplotlib)
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following extra packages will be installed:
  blt libtcl8.6 libtk8.6 libxft2 libxss1
Suggested packages:
  blt-demo tcl8.6 tk8.6 tix python3-tk-dbg
The following NEW packages will be installed:
  blt libtcl8.6 libtk8.6 libxft2 libxss1 python3-tk
0 upgraded, 6 newly installed, 0 to remove and 0 not upgraded.
Need to get 2,151 kB of archives.
After this operation, 8,963 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu/ trusty/main libtcl8.6 amd64 8.6.1-4ubuntu1 [841 kB]
Get:2 http://archive.ubuntu.com/ubuntu/ trusty/main libxft2 amd64 2.3.1-2 [36.2 kB]
Get:3 http://archive.ubuntu.com/ubuntu/ trusty/main libxss1 amd64 1:1.2.2-1 [8,582 B]
Get:4 http://archive.ubuntu.com/ubuntu/ trusty/main libtk8.6 amd64 8.6.1-3ubuntu2 [689 kB]
Get:5 http://archive.ubuntu.com/ubuntu/ trusty/main blt amd64 2.4z-7ubuntu2 [553 kB]
Get:6 http://archive.ubuntu.com/ubuntu/ trusty-updates/main python3-tk amd64 3.4.3-1~14.04.2 [23.5 kB]
Fetched 2,151 kB in 5s (420 kB/s)
Selecting previously unselected package libtcl8.6:amd64.
(Reading database ... 34611 files and directories currently installed.)
Preparing to unpack .../libtcl8.6_8.6.1-4ubuntu1_amd64.deb ...
Unpacking libtcl8.6:amd64 (8.6.1-4ubuntu1) ...
Selecting previously unselected package libxft2:amd64.
Preparing to unpack .../libxft2_2.3.1-2_amd64.deb ...
Unpacking libxft2:amd64 (2.3.1-2) ...
Selecting previously unselected package libxss1:amd64.
Preparing to unpack .../libxss1_1%3a1.2.2-1_amd64.deb ...
Unpacking libxss1:amd64 (1:1.2.2-1) ...
Selecting previously unselected package libtk8.6:amd64.
Preparing to unpack .../libtk8.6_8.6.1-3ubuntu2_amd64.deb ...
Unpacking libtk8.6:amd64 (8.6.1-3ubuntu2) ...
Selecting previously unselected package blt.
Preparing to unpack .../blt_2.4z-7ubuntu2_amd64.deb ...
Unpacking blt (2.4z-7ubuntu2) ...
Selecting previously unselected package python3-tk.
Preparing to unpack .../python3-tk_3.4.3-1~14.04.2_amd64.deb ...
Unpacking python3-tk (3.4.3-1~14.04.2) ...
Setting up libtcl8.6:amd64 (8.6.1-4ubuntu1) ...
Setting up libxft2:amd64 (2.3.1-2) ...
Setting up libxss1:amd64 (1:1.2.2-1) ...
Setting up libtk8.6:amd64 (8.6.1-3ubuntu2) ...
Setting up blt (2.4z-7ubuntu2) ...
Setting up python3-tk (3.4.3-1~14.04.2) ...
Processing triggers for libc-bin (2.19-0ubuntu6.9) ...

In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
df = pd.read_csv("./data/enwiki.draft_quality.75_not_OK_sample.censored.tsv", sep="\t")

In [4]:
df.head()


Out[4]:
page_title rev_id creation_timestamp archived draft_quality censored_text
0 Government_Achuthan_girls_hss 688249460 20151030165831 1 spam 125 years ago APPU NEDUNGADI STARTED THIS SCH...
1 Spiromax_EDMS 731688963 20160726220726 1 spam Spiromax is a British technology media and com...
2 Steph_Curry_UA_2 693824273 20151205035603 1 spam The Steph Curry UA two Shoes are a hot-selling...
3 Valletta_Cruise_Port 722151405 20160526085302 1 spam ''Valletta Cruise Port plc''' is a private com...
4 RWG_Mobile 731355950 20160724204124 1 spam [[File:RWGmobile.png|thumb|RWG Mobile logo]]\n...

In [5]:
df["len_text"] = df["censored_text"].apply(len)

In [6]:
df.head()


Out[6]:
page_title rev_id creation_timestamp archived draft_quality censored_text len_text
0 Government_Achuthan_girls_hss 688249460 20151030165831 1 spam 125 years ago APPU NEDUNGADI STARTED THIS SCH... 195
1 Spiromax_EDMS 731688963 20160726220726 1 spam Spiromax is a British technology media and com... 989
2 Steph_Curry_UA_2 693824273 20151205035603 1 spam The Steph Curry UA two Shoes are a hot-selling... 543
3 Valletta_Cruise_Port 722151405 20160526085302 1 spam ''Valletta Cruise Port plc''' is a private com... 2980
4 RWG_Mobile 731355950 20160724204124 1 spam [[File:RWGmobile.png|thumb|RWG Mobile logo]]\n... 2099

In [7]:
find_urls = lambda x: re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)

In [8]:
df["urls"] = df["censored_text"].apply(find_urls)

In [9]:
df.head()


Out[9]:
page_title rev_id creation_timestamp archived draft_quality censored_text len_text urls
0 Government_Achuthan_girls_hss 688249460 20151030165831 1 spam 125 years ago APPU NEDUNGADI STARTED THIS SCH... 195 [https://www.facebook.com/groups/4806262819778...
1 Spiromax_EDMS 731688963 20160726220726 1 spam Spiromax is a British technology media and com... 989 []
2 Steph_Curry_UA_2 693824273 20151205035603 1 spam The Steph Curry UA two Shoes are a hot-selling... 543 []
3 Valletta_Cruise_Port 722151405 20160526085302 1 spam ''Valletta Cruise Port plc''' is a private com... 2980 [https://www.youtube.com/watch?v=FMThbEG95WA</...
4 RWG_Mobile 731355950 20160724204124 1 spam [[File:RWGmobile.png|thumb|RWG Mobile logo]]\n... 2099 [https://www.rwgmobile.wales/about-rwg/</ref>....

In [10]:
df["draft_quality"].unique()


Out[10]:
array(['spam', 'attack', 'vandalism'], dtype=object)

In [11]:
df["count_urls"] = df["urls"].apply(len)

In [12]:
df.head()


Out[12]:
page_title rev_id creation_timestamp archived draft_quality censored_text len_text urls count_urls
0 Government_Achuthan_girls_hss 688249460 20151030165831 1 spam 125 years ago APPU NEDUNGADI STARTED THIS SCH... 195 [https://www.facebook.com/groups/4806262819778... 1
1 Spiromax_EDMS 731688963 20160726220726 1 spam Spiromax is a British technology media and com... 989 [] 0
2 Steph_Curry_UA_2 693824273 20151205035603 1 spam The Steph Curry UA two Shoes are a hot-selling... 543 [] 0
3 Valletta_Cruise_Port 722151405 20160526085302 1 spam ''Valletta Cruise Port plc''' is a private com... 2980 [https://www.youtube.com/watch?v=FMThbEG95WA</... 3
4 RWG_Mobile 731355950 20160724204124 1 spam [[File:RWGmobile.png|thumb|RWG Mobile logo]]\n... 2099 [https://www.rwgmobile.wales/about-rwg/</ref>.... 3

In [13]:
spams = list(df[df.draft_quality == "spam"]["count_urls"])
attacks = list(df[df.draft_quality == "attack"]["count_urls"])
vandalisms = list(df[df.draft_quality == "vandalism"]["count_urls"])

In [14]:
data_to_plot = [spams, attacks, vandalisms]

In [21]:
# Create a figure instance
fig = plt.figure()

# Create an axes instance
ax = fig.add_subplot(111)

# Create the boxplot
bp = ax.boxplot(data_to_plot)

# Display the figure
fig.show()


/usr/local/lib/python3.4/dist-packages/matplotlib/figure.py:397: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "

In [22]:
df["censored_text"]


Out[22]:
0     125 years ago APPU NEDUNGADI  STARTED THIS SCH...
1     Spiromax is a British technology media and com...
2     The Steph Curry UA two Shoes are a hot-selling...
3     ''Valletta Cruise Port plc''' is a private com...
4     [[File:RWGmobile.png|thumb|RWG Mobile logo]]\n...
5     Transfer your files from anywhere to anywhere\...
6     {{Infobox organization\n| name   = The HeroesT...
7     Victor Jerome Beasley Jr (born July 18, 1981),...
8     ''<big>suresh Kumar Mishra 'Uratrupt'</big>'''...
9      == ABOUT KAMRUL HASAN KOFIL (KHK) ==\n\n* Ful...
10    [http://Localizer.co Localizer.co] is a softwa...
11    [http://www.justjoce.com/faq.html#0 Ginger Ros...
12    <!-- Don't mess with this line! -->{{New unrev...
13    Trailer Park Boys Mobile Game\n\nComing Spring...
14    Tana Monceau is a youtube star, with over 1,34...
15    {{AFC submission|t||ts=20160324040906|u=Yongsh...
16    This book was written by a group called DICE w...
17    ''[http://www.baselinemag.com/ Baseline]''' (b...
18    Emerald marketing research is a full service c...
19    Prawns & Fishes is the chain of restaurants st...
20    RICHES Mosaic Interface\u2122 https://richesmi...
21    ''History'''\n[http://middleeastbank.ir/ Middl...
22    {{User LEGWAN}}\n<!-- EDIT BELOW THIS LINE -->...
23    Copyblogger\nEmail Marketing\nHOW TO PUSH SEND...
24    Xara Skin Clinic Sydney is Located in the hear...
25    Corin Scott is a complete nutter. He should de...
26    An immigrant from a very poor unstable part of...
27    Sean Tynan was born on the 2nd of July 1999 in...
28    Jesse Tyler Ridgway (Born: September 29, 1992)...
29    Jacob Sartorius is a VERY IMPORTANT person who...
                            ...                        
45    <!-- Don't mess with this line! -->{{New unrev...
46    Styles Moody lives in San Diego California. He...
47                        he is weird and is bad in bed
48     == Daddy Jarvis ==\nDaddy Jarvis is an admin ...
49    [[File:Jervie Calibo "Calibuto".jpeg|thumbnail...
50                                          {{db-band}}
51    Chief Fresha Lutenoff of Sweden is famous for ...
52    It is a child\n\nWorld renowned for his memes\...
53                                                    w
54    Christopher Franz Schultchen, (born <!-- Censo...
55    Yo cunts liking you butts\n\nlike sunny weathe...
56    The Leafy Greens Movement was founded in the y...
57                   a fuc*ing se*y motherfuc*er  actor
58    a few useful tips to help you be a much better...
59    Andrew Artiem Sauk is an Actor. He was born in...
60    Asteroid Masters is a 1997 american action/adv...
61    Krider is a LOL account of the best player in ...
62    Irny was founded on 1777.It is now a 9 island ...
63    \n Name : Bhavik Bharambhe\n age : 14\n countr...
64            #REDIRECT [[Lolzz]]\n\n{{redr|from move}}
65    {{Infobox person\n| name        = Damon Colema...
66    The Canis Careyus was a fierce organism that l...
67    Selimentosanda is a very good Japanese watch c...
68    charlie is jesus jesus is charlie charlie is j...
69                                      ˢᵘᶜᵏ ᵐʸ ᵃˢˢ ʳᵉˣ
70    Nine Ball Island is a small island owned by Je...
71    Harrison nuti, outstanding human being would b...
72    In the island of Cyprus, Nicosian you call a r...
73    {{multiple issues|\n{{refimprove|date=December...
74     Foniasaurus \nFrom Wikipedia, the free encycl...
Name: censored_text, dtype: object

In [40]:
w = ["4U","Claims you are a winner","For instant access","Accept credit cards","Claims you registered with Some Kind of Partner","For just $ (some amt)","Act now! Don’t hesitate!","Click below","Free access","Additional income","Click here link","Free cell phone","Addresses on CD","Click to remove","Free consultation","All natural","Click to remove mailto","Free DVD","Amazing","Compare rates","Free grant money","Apply Online","Compete for your business","Free hosting","As seen on","Confidentially on all orders","Free installation","Auto email removal","Congratulations","Free investment","Avoid bankruptcy","Consolidate debt and credit","Free leads","Be amazed","Copy accurately","Free membership","Be your own boss","Copy DVDs","Free money","Being a member","Credit bureaus","Free offer","Big bucks","Credit card offers","Free preview","Bill 1618","Cures baldness","Free priority mail","Billing address","Dear email","Free quote","Billion dollars","Dear friend","Free sample","Brand new pager","Dear somebody","Free trial","Bulk email","Different reply to","Free website","Buy direct","Dig up dirt on friends","Full refund","Buying judgments","Direct email","Get It Now","Cable converter","Direct marketing","Get paid","Call free","Discusses search engine listings","Get started now","Call now","Do it today","Gift certificate","Calling creditors","Don’t delete","Great offer","Can’t live without","Drastically reduced","Guarantee","Cancel at any time","Earn per week","Have you been turned down?","Cannot be combined with any other offer","Easy terms","Hidden assets","Cash bonus","Eliminate bad credit","Home employment","Cashcashcash","Email harvest","Human growth hormone","Casino","Email marketing","If only it were that easy","Cell phone cancer scam","Expect to earn","In accordance with laws","Cents on the dollar","Fantastic deal","Increase sales","Check or money order","Fast Viagra delivery","Increase traffic","Claims not to be selling anything","Financial freedom","Insurance","Claims to be in accordance with some spam law","Find out anything","Investment decision","Claims to be legal","For free","It's effective","Join millions of Americans","No questions asked","Reverses aging","Laser printer","No selling","Risk free","Limited time only","No strings attached","Round the world","Long distance phone offer","Not intended","S 1618","Lose weight spam","Off shore","Safeguard notice","Lower interest rates","Offer expires","Satisfaction guaranteed","Lower monthly payment","Offers coupon","Save $","Lowest price","Offers extra cash","Save big money","Luxury car","Offers free (often stolen) passwords","Save up to","Mail in order form","Once in lifetime","Score with babes","Marketing solutions","One hundred percent free","Section 301","Mass email","One hundred percent guaranteed","See for yourself","Meet singles","One time mailing","Sent in compliance","Member stuff","Online biz opportunity","Serious cash","Message contains disclaimer","Online pharmacy","Serious only","MLM","Only $","Shopping spree","Money back","Opportunity","Sign up free today","Money making","Opt in","Social security number","Month trial offer","Order now","Special promotion","More Internet traffic","Order status","Stainless steel","Mortgage rates","Orders shipped by priority mail","Stock alert","Multi level marketing","Outstanding values","Stock disclaimer statement","Name brand","Pennies a day","Stock pick","New customers only","People just leave money laying around","Stop snoring","New domain extensions","Please read","Strong buy","Nigerian","Potential earnings","Stuff on sale","No age restrictions","Print form signature","Subject to credit","No catch","Print out and fax","Supplies are limited","No claim forms","Produced and sent out","Take action now","No cost","Profits","Talks about hidden charges","No credit check","Promise you …!","Talks about prizes","No disappointment","Pure profit","Tells you it’s an ad","No experience","Real thing","Terms and conditions","No fees","Refinance home","The best rates","No gimmick","Removal instructions","The following form","No inventory","Remove in quotes","They keep your money — no refund!","No investment","Remove subject","They’re just giving it away","No medical exams","Removes wrinkles","This isn’t junk","No middleman","Reply remove subject","This isn’t spam","No obligation","Requires initial investment","University diplomas","No purchase necessary","Reserves the right","Unlimited","Unsecured credit/debt","We honor all","Will not believe your eyes","Urgent","Weekend getaway","Winner","US dollars","What are you waiting for?","Winning","Vacation offers","While supplies last","Work at home","Viagra and other drugs","While you sleep","You have been selected","Wants credit card","Who really wins?","Your income","We hate spam","Why pay more?"]

In [41]:
w = [word.lower() for word in w]

In [ ]:


In [45]:
df["nb_spam_word"] = df["censored_text"].apply(lambda x: np.sum([x.lower().count(word) for word in w] ))

In [46]:
df["freq_spam_word"] = df["nb_spam_word"] / df["len_text"]

In [47]:
df["freq_spam_word"]


Out[47]:
0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000797
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    0.000163
12    0.000000
13    0.000000
14    0.000000
15    0.000244
16    0.000000
17    0.000000
18    0.000000
19    0.000000
20    0.000000
21    0.000000
22    0.000000
23    0.004876
24    0.000000
25    0.000000
26    0.000000
27    0.000000
28    0.000027
29    0.000000
        ...   
45    0.000000
46    0.000000
47    0.000000
48    0.000000
49    0.000000
50    0.000000
51    0.000000
52    0.000000
53    0.000000
54    0.000000
55    0.000000
56    0.000000
57    0.000000
58    0.000738
59    0.000000
60    0.000000
61    0.000000
62    0.000000
63    0.000000
64    0.000000
65    0.000000
66    0.000000
67    0.000000
68    0.000000
69    0.000000
70    0.000000
71    0.000000
72    0.001115
73    0.000000
74    0.000000
Name: freq_spam_word, dtype: float64

In [ ]: