In [1]:
import pandas as pd
import numpy as np

# Each line of these files is one whole tweet, so '\n' is used as the
# "separator" to force a single-column frame (column 0 = raw tweet text).
# NOTE(review): sep='\n' is deprecated/removed in newer pandas — confirm
# the pinned version before upgrading.
sarcastic_tweets = pd.read_csv('twitDB_sarcasm.csv', header=None, sep='\n')
regular_tweets = pd.read_csv('twitDB_regular.csv', header=None, sep='\n')

In [2]:
# Row count of the sarcastic corpus (one tweet per row).
len(sarcastic_tweets)


Out[2]:
150336

In [3]:
# Row count of the regular (non-sarcastic) corpus (one tweet per row).
len(regular_tweets)


Out[3]:
330692

In [4]:
from nltk.corpus import stopwords

In [ ]:
# Tag each corpus with its class label so the two frames can later be
# combined and still distinguish 'sarcastic' from 'genuine' rows.
sarcastic_tweets['type'] = 'sarcastic'
regular_tweets['type'] = 'genuine'

In [ ]:
#  88  88 8888o. 88 .o8888 8888o. .o88o. 888888o.    888888o. .o88o. 8888o. 888888 88     
#  88  88 88  88 88 88     88  88 88  88 88 88 88    88 88 88 88  88 88  88 88     88     
#  88  88 88  88 88 88  88 8888Y' 888888 88 88 88    88 88 88 88  88 88  88 8888   88     
#  88  88 88  88 88 88  88 88  88 88  88 88 88 88    88 88 88 88  88 88  88 88     88     
#  'Y88Y' 88  88 88 'Y8888 88  88 88  88 88 88 88    88 88 88 'Y88Y' 8888Y' 888888 888888 

import re
from bs4 import BeautifulSoup
# Cleaning tweets, courtesy of Kaggle
def review_to_words(raw_review):
    """Normalize one raw tweet into a space-joined string of content words.

    Steps: strip HTML/entities, keep only letters and '#', remove the
    literal substring 'sarcasm' (label leakage), drop "RT "-style retweet
    markers, lowercase, and filter English stop words.

    Parameters
    ----------
    raw_review : str
        One raw tweet.

    Returns
    -------
    str
        Cleaned, lowercased, stop-word-free text.
    """
    # 1. Remove HTML markup/entities.
    review_text = BeautifulSoup(raw_review, "lxml").get_text()

    # 2. Keep only letters and '#' (hashtags survive).
    #    BUG FIX: the original ran this substitution on raw_review,
    #    silently discarding the HTML-stripped review_text from step 1.
    letters_only = re.sub("[^a-zA-Z#]", " ", review_text)

    # Remove the label-leaking token so the classifier cannot cheat.
    # NOTE: removes 'sarcasm' as a substring anywhere, not only as a
    # whole word (matches the original behavior).
    letters_only = letters_only.replace('sarcasm', '')

    # Drop "RT "/"rt " retweet markers (any case), wherever they appear.
    letters_only = re.sub(r"[rR][tT] ", "", letters_only)

    # 3. Lowercase and tokenize on whitespace.
    words = letters_only.lower().split()

    # 4. Set membership is O(1) vs. O(n) for a list.
    stops = set(stopwords.words("english"))

    # 5./6. Drop stop words and rejoin into a single string.
    return " ".join(w for w in words if w not in stops)


def _clean_tweet_series(tweets):
    """Run review_to_words over column 0 of a tweet frame; returns a list
    of cleaned strings, one per tweet, in row order."""
    return [review_to_words(raw) for raw in tweets[0]]

# The original had two copy-pasted index loops; one helper covers both.
clean_train_tweets_sarcastic = _clean_tweet_series(sarcastic_tweets)
clean_train_tweets_regular = _clean_tweet_series(regular_tweets)

# Preserve the module-level side effect of the original loops (its final
# value was the regular-corpus size) in case a later cell reads it.
num_reviews = regular_tweets[0].size


/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'https://t.co/hphPwdcRik'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'https://t.co/dWo5WpBhxW'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'https://t.co/5iubyNBFhk'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/TTBaZ5a3ie'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/uDjhjT81hl'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:198: UserWarning: "b'.'" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/LBB7m2MKy4'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/hPs5g5BBuA'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/zTvU7Q7zT9'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/SB4FgL45Hd'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/KMiF7bBTcm'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/5gxispCObq'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/P8kRLI3FMq'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/Io3EXmALGT'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'https://t.co/MCfUnn9hh9'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/joseph/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:207: UserWarning: "b'http://t.co/Cl7X5iywkm'" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

In [ ]:
# Strip any remaining 'sarcasm' substrings from the cleaned tweets.
# (review_to_words already removes this token, so this second pass is a
# belt-and-braces no-op on its output — kept to preserve the pipeline.)
sarcastic_tweets_parsed = [
    tweet.replace('sarcasm', '') for tweet in clean_train_tweets_sarcastic
]

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words featurizer.  All four arguments are passed explicitly even
# though they are the library defaults, to make the configuration obvious.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
)

# fit_transform learns the vocabulary and encodes each tweet as a sparse
# term-count row; densify to a plain ndarray for convenient access.
train_data_features_sarcastic = vectorizer.fit_transform(
    clean_train_tweets_sarcastic
).toarray()

In [ ]:
# Indices of the non-zero entries of the (n_tweets, vocab) count matrix —
# a quick sanity check that the features are not all zero.
train_data_features_sarcastic.nonzero()

In [ ]:
# Vocabulary size learned by the vectorizer.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (replaced by get_feature_names_out()) — fine on the old version here,
# but confirm before upgrading.
len(vectorizer.get_feature_names())

In [ ]:
# Import the one name we use instead of `from nltk.stem.porter import *`
# (star imports pollute the namespace and hide where names come from).
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Porter-stem every word of every parsed tweet; keep one space-joined
# string per tweet (same tokenization as the original: split on ' ').
sarcastic_tweets_stemmed = [
    " ".join(ps.stem(word) for word in tweet.split(" "))
    for tweet in sarcastic_tweets_parsed
]

In [ ]:
# Preview only — the full list has ~150k entries and displaying it whole
# floods the notebook output.
sarcastic_tweets_stemmed[:10]

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Rebuild the bag-of-words features from the *stemmed* tweets.  The
# vectorizer is re-created so its vocabulary reflects stemmed tokens;
# the four arguments are the library defaults, passed explicitly.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None)

# Learn vocabulary and encode each tweet as a sparse term-count row.
train_data_features_sarcastic = vectorizer.fit_transform(sarcastic_tweets_stemmed)

# Densify for convenient numpy-style access.
train_data_features_sarcastic = train_data_features_sarcastic.toarray()

# Show only a sample of the vocabulary: the original displayed every
# feature name (tens of thousands), drowning the notebook output.
vectorizer.get_feature_names()[:25]

In [ ]:
# Vocabulary size after stemming — compare with the pre-stemming count
# above to see how much stemming collapsed the vocabulary.
len(vectorizer.get_feature_names())

In [ ]:
from sklearn import tree
# Placeholder/demo: fits a decision tree on a two-point toy dataset.
# Not connected to the tweet features — presumably scaffolding for the
# eventual sarcasm classifier (TODO: fit on train_data_features_* with
# the 'sarcastic'/'genuine' labels instead of these dummies).
X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [ ]:
# Scratch check: verify BeautifulSoup passes a plain hashtag string
# through get_text() (no HTML present, so nothing should be stripped).
BeautifulSoup(" #endfathersday ","lxml").get_text()