In [1]:
import os
import couchdb
from lib.genderComputer.genderComputer import GenderComputer

In [2]:
server = couchdb.Server(url='http://127.0.0.1:15984/')
db = server['tweets']
gc = GenderComputer(os.path.abspath('./data/nameLists'))
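
GenderComputer resolves a gender from a bare name; the second argument is an optional country, passed as None throughout this notebook. A quick sanity check with made-up names (return values are typically 'male', 'female', 'unisex', or None when the name cannot be resolved):

In [ ]:
# illustrative names, not from the dataset
print(gc.resolveGender('Victoria', None))
print(gc.resolveGender('John', None))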

In [8]:
date_list = []
for row in db.view('_design/analytics/_view/conversation-date-breakdown', reduce=True, group=True):
    date_list.append(row.key)
    
print(date_list)


[u'2017/3/10', u'2017/3/11', u'2017/3/12', u'2017/3/13', u'2017/3/14', u'2017/3/15', u'2017/3/16', u'2017/3/17', u'2017/3/18', u'2017/3/19', u'2017/3/20', u'2017/3/21', u'2017/3/22', u'2017/3/23', u'2017/3/24', u'2017/3/25', u'2017/3/26', u'2017/3/27', u'2017/3/28', u'2017/3/29', u'2017/3/30', u'2017/3/6', u'2017/3/7', u'2017/3/8', u'2017/3/9', u'2017/4/1', u'2017/4/2', u'2017/4/3', u'2017/4/4', u'2017/4/5', u'2017/4/6', u'2017/4/7', u'2017/4/8']
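
The map/reduce source behind conversation-date-breakdown is not shown in this notebook; a minimal sketch of what the design document might look like, saved through couchdb-python (the emitted 'yyyy/m/d' key and the doc.date field are assumptions):

In [ ]:
design = {
    "_id": "_design/analytics",
    "views": {
        "conversation-date-breakdown": {
            # assumes each tweet doc carries a 'yyyy/m/d' date string
            "map": "function(doc) { emit(doc.date, 1); }",
            "reduce": "_count"  # grouped reduce yields one row per date
        }
    }
}
# db.save(design)  # run once, when the design doc does not exist yet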

In [63]:
from collections import Counter
view_data = []
for row in db.view('_design/analytics/_view/tweets-victoria', startkey="2017/3/6", endkey="2017/3/9"):
    view_data.append(row.value)

In [64]:
len(view_data)


Out[64]:
254
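
Each row's value is a full tweet document; the cells below rely on its text, sentiment, hashtags, and user fields. A quick peek at the shape:

In [ ]:
# list the fields carried by the first row (keys vary with the view's map function)
print(sorted(view_data[0].keys()))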

In [65]:
try:
    hashtags = server.create("twitter-hashtags") # create the database on first run
except couchdb.http.PreconditionFailed:
    hashtags = server["twitter-hashtags"] # already exists, just open it

hashtag_count = Counter()

for row in view_data:
    hashtag_count.update(row["hashtags"])

for tag in hashtag_count.most_common():
    doc = hashtags.get(tag[0]) # tag[0] -> hashtag, tag[1] -> frequency
    if doc is None:
        data = {}
        data["_id"] = tag[0].replace('\u','') # use word as an id
        data["hashtag"] = tag[0].replace('\u','')
        data["count"] = tag[1]
    else:
        data = doc
        data["count"] = data["count"] + tag[1]
    
    hashtags.save(data)
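
Re-running the cell is safe: documents returned by get() carry their _rev, so save() performs an update instead of raising a ResourceConflict. To eyeball the counts that were just written:

In [ ]:
# top hashtags from the Counter built above
for tag, freq in hashtag_count.most_common(5):
    print(tag, freq)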

In [66]:
texts = []
users = []
for row in view_data:
    text = {}
    text["text"] = row["text"]
    text["sentiment"] = row["sentiment"]
    texts.append(text)
    user = row["user"]
    try:
        gender = gc.resolveGender(user["name"], None)
        user["gender"] = gender
    except Exception:
        continue # skip users whose name cannot be resolved
    users.append(user)
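
A quick look at how the resolver labelled this batch:

In [ ]:
print(Counter(u.get("gender") for u in users))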

In [67]:
print("text",len(texts)," user", len(users))


('text', 254, ' user', 254)

In [68]:
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
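
A quick check of the tokenizer on a made-up tweet (the text is illustrative):

In [ ]:
sample = "RT @user: loving the #CloudDB tutorial :) http://example.com"
print(preprocess(sample, lowercase=True))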

In [69]:
## Save term frequencies
import HTMLParser
from collections import Counter
from nltk.corpus import stopwords
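# the stopword corpus must be downloaded once: import nltk; nltk.download('stopwords')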
import string
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
count_all = Counter()
html_parser = HTMLParser.HTMLParser()
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)
for text in texts:
    cleanText = re.sub(r"http\S+", "", text['text'])
    cleanText = html_parser.unescape(cleanText)
    cleanText = emoji_pattern.sub(r'', cleanText)
    terms_stop = [term for term in preprocess(cleanText) if term not in stop]
    count_all.update(terms_stop)

try:
    words = server.create("twitter-words") # create the database on first run
except couchdb.http.PreconditionFailed:
    words = server["twitter-words"]
    
for num in count_all.most_common():
    doc = words.get(num[0]) # num[0] -> word, num[1] -> frequency
    try:
        if doc is None:
            data = {}
            # strip non-ASCII characters so the word is safe to use as a doc id
            term = num[0] if isinstance(num[0], unicode) else num[0].decode('utf8')
            word_text = term.encode('ascii', 'ignore')
            data["_id"] = word_text # use the word itself as the document id
            data["word"] = word_text
            data["count"] = num[1]
        else:
            data = doc
            data["count"] = data["count"] + num[1]
        words.save(data)
    except Exception:
        continue # skip words that fail to encode or save
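
The loop above issues one HTTP round trip per word. couchdb-python's Database.update() can write documents in bulk instead; a sketch covering only brand-new words (it skips the ASCII-stripping step above for brevity, and updating existing docs would still need their _rev):

In [ ]:
new_docs = [{"_id": w, "word": w, "count": c}
            for w, c in count_all.most_common() if words.get(w) is None]
words.update(new_docs)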

In [70]:
# save user data
# create the user database if it does not exist
try:
    user = server.create("twitter-users")
except couchdb.http.PreconditionFailed:
    user = server["twitter-users"]
    
for row in users:
    doc = user.get(str(row["id"]))
    if doc is None:
        row["_id"] = str(row["id"])
        user.save(row)

In [23]:
"☕".decode("utf8").encode('ascii','ignore') == ""


Out[23]:
True

In [4]:
import datetime
today = datetime.date.today()
today = today.strftime('%Y/%-m/%-d')
print(today)


2017/5/9
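
Note that the '-' flag in '%-m'/'%-d' (suppress zero padding) is a glibc extension: it works on Linux and macOS but fails on Windows. A portable way to build the same key:

In [ ]:
today = datetime.date.today()
print("{0}/{1}/{2}".format(today.year, today.month, today.day))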

In [ ]: