In [1]:
import os
import couchdb
from lib.genderComputer.genderComputer import GenderComputer
In [2]:
server = couchdb.Server(url='http://127.0.0.1:15984/')
db = server['tweets']
gc = GenderComputer(os.path.abspath('./data/nameLists'))
In [8]:
date_list = []
for row in db.view('_design/analytics/_view/conversation-date-breakdown', reduce=True, group=True):
    date_list.append(row.key)
print(date_list)
In [63]:
from collections import Counter
view_data = []
for row in db.view('_design/analytics/_view/tweets-victoria', startkey="2017/3/6", endkey="2017/3/9"):
    view_data.append(row.value)
In [64]:
len(view_data)
Out[64]:
In [65]:
try:
    hashtags = server.create("twitter-hashtags")  # create the database on first run
except couchdb.http.PreconditionFailed:
    hashtags = server["twitter-hashtags"]  # reuse it if it already exists
hashtag_count = Counter()
for row in view_data:
    hashtag_count.update(row["hashtags"])
for tag in hashtag_count.most_common():
    doc = hashtags.get(tag[0])  # tag[0] -> hashtag, tag[1] -> frequency
    if doc is None:
        data = {}
        data["_id"] = tag[0].replace('\\u', '')  # strip stray '\u' escapes and use the hashtag as the id
        data["hashtag"] = tag[0].replace('\\u', '')
        data["count"] = tag[1]
    else:
        data = doc
        data["count"] = data["count"] + tag[1]
    hashtags.save(data)
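As a quick sanity check, a stored per-hashtag document can be read back by id; the tag used below is only an illustrative placeholder, not necessarily present in the data.

In [ ]:
# Hypothetical example: look up the stored count for one hashtag (swap 'yyj' for any tag seen above).
doc = hashtags.get('yyj')
if doc is not None:
    print(doc['hashtag'], doc['count'])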
In [66]:
texts = []
users = []
for row in view_data:
    text = {}
    text["text"] = row["text"]
    text["sentiment"] = row["sentiment"]
    texts.append(text)
    user = row["user"]
    try:
        gender = gc.resolveGender(user["name"], None)
        user["gender"] = gender
    except Exception:
        continue  # skip users whose name cannot be resolved
    users.append(user)
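The resolver can be spot-checked on a couple of hand-picked names before trusting the per-user results; the names below are made up, and the country argument is left as None, exactly as in the loop above.

In [ ]:
# Illustrative spot check of the gender resolver; 'Alice Smith' and 'Bob Jones' are placeholder names.
print(gc.resolveGender('Alice Smith', None))
print(gc.resolveGender('Bob Jones', None))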
In [67]:
print("text",len(texts)," user", len(users))
In [68]:
import re
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
return tokens
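A quick look at the tokenizer on a made-up tweet (the sample text below is not from the collection) shows how mentions, hashtags, URLs and emoticons survive as single tokens while ordinary words are lower-cased.

In [ ]:
# Illustrative only: tokenize a made-up tweet to see how the regex splits it.
sample = "RT @user: loving #yyj today :) http://example.com"
print(preprocess(sample, lowercase=True))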
In [69]:
## Save term frequencies
import HTMLParser
from collections import Counter
from nltk.corpus import stopwords
import string
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
count_all = Counter()
html_parser = HTMLParser.HTMLParser()
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"   # flags (iOS)
    "+", flags=re.UNICODE)
for text in texts:
    cleanText = re.sub(r"http\S+", "", text['text'])  # strip URLs
    cleanText = html_parser.unescape(cleanText)       # decode HTML entities
    cleanText = emoji_pattern.sub(r'', cleanText)     # drop emoji
    terms_stop = [term for term in preprocess(cleanText) if term not in stop]
    count_all.update(terms_stop)
try:
    words = server.create("twitter-words")  # create the database on first run
except couchdb.http.PreconditionFailed:
    words = server["twitter-words"]  # reuse it if it already exists
for num in count_all.most_common():
    doc = words.get(num[0])  # num[0] -> word, num[1] -> frequency
    try:
        if doc is None:
            data = {}
            word_text = num[0].decode("utf8").encode('ascii', 'ignore')  # make sure we don't save unsafe characters
            data["_id"] = word_text  # use the word as the document id
            data["word"] = word_text
            data["count"] = num[1]
        else:
            data = doc
            data["count"] = data["count"] + num[1]
        words.save(data)
    except Exception:
        continue  # skip words that cannot be converted or saved (e.g. an empty id)
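Before moving on, it helps to eyeball the most frequent terms; this only reads the in-memory counter built above, so nothing in CouchDB is touched.

In [ ]:
# Peek at the ten most common terms counted above.
print(count_all.most_common(10))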
In [70]:
# save user data
# create the user db if it does not exist yet
try:
    users_db = server.create("twitter-users")
except couchdb.http.PreconditionFailed:
    users_db = server["twitter-users"]
for row in users:
    user_id = str(row["id"])
    doc = users_db.get(user_id)
    if doc is None:
        row["_id"] = user_id  # use the numeric Twitter id as the document id
        users_db.save(row)
In [23]:
"☕".decode("utf8").encode('ascii','ignore') == ""
Out[23]:
In [4]:
import datetime
today = datetime.date.today()
today = today.strftime('%Y/%-m/%-d')  # no zero-padding (%-m/%-d is a glibc/BSD strftime extension), matching view keys like '2017/3/6'
print(today)
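The string produced above has the same shape as the startkey/endkey values used earlier, so, assuming the view is keyed by these date strings, today's rows could be fetched the same way.

In [ ]:
# Sketch: fetch rows keyed by today's date from the same view used earlier.
todays_rows = [row.value for row in db.view('_design/analytics/_view/tweets-victoria', startkey=today, endkey=today)]
print(len(todays_rows))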
In [ ]: