Build topic_news and topic_tweets docs

Objective: aggregate associated news and tweets for each topic into a doc

Last modified: 2017-10-23

Roadmap

  1. Build topic_news and topic_tweets docs.
  2. Re-build topic_tweets docs. Remove tweets with the same or silimar tweet_text field.

Steps


In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import sqlite3
import time
import codecs
import csv
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
nb_name = '20171012-daheng-build_topic_news_tweets_docs'

Build topic_news and topic_tweets docs


In [2]:
%%time
'''
Load in topics information pkl
'''
if 1 == 1:
    with open(config.TOPICS_LST_PKL, 'rb') as f:
        topics_lst = pickle.load(f)


CPU times: user 2.65 s, sys: 3.28 s, total: 5.93 s
Wall time: 5.93 s

In [3]:
%%time
"""
For each topic, query db
 - write news_doc into topic_news doc
 - write tweet_text into topic_tweets doc
 
All docs are placed inside config.TOPICS_DOCS_DIR
 - topic_news doc follows name convention: [topic_ind]-[topic_name].news.csv
 - topic_tweets doc follows name convention: [topic_ind]-[topic_name].tweets.csv
"""

if 0 == 1:
    '''
    Define topic_news and topic_tweets doc format
    '''
    csv.register_dialect('topics_docs_line', delimiter='\t', doublequote=True, quoting=csv.QUOTE_ALL)
    
    for topic_ind, topic in enumerate(topics_lst):
        topic_name = topic['name']
        news_native_ids_lst = topic['news_native_ids_lst']
        tweets_ids_lst = topic['tweets_ids_lst']
        
        print('{} Topic_name: {}; news_num: {}; tweets_num: {}'.format(topic_ind, topic_name, len(news_native_ids_lst), len(tweets_ids_lst)))
        
        with sqlite3.connect(config.NEWS_TWEETS_DB_FILE) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            
            '''
            Write topic_news doc
            '''
            query_news = '''
            select news_title, news_collected_time, news_native_id, news_doc from news
            where news_native_id = :news_native_id;'''
            
            output_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.news.csv'.format(topic_ind, topic_name))
            with open(output_file, 'w') as f:
                print('\tWriting topic_news doc ...')
                fieldnames = ('news_native_id', 'news_collected_time', 'news_title', 'news_doc')
                writer = csv.DictWriter(f, fieldnames=fieldnames, dialect='topics_docs_line')
                writer.writeheader()

                for news_native_id in news_native_ids_lst:
                    cursor.execute(query_news, {'news_native_id': news_native_id})
                    for row in cursor.fetchall():
                        writer.writerow({'news_native_id': row['news_native_id'],
                                         'news_collected_time': row['news_collected_time'],
                                         'news_title': row['news_title'],
                                         'news_doc': row['news_doc']})
                    
            
            '''
            Write topic_tweets doc
            
            NOTE: tweets_num for each topic is much larger, should use batch query otherwise it would take hours
            '''
            batch_size = 500
                        
            output_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.tweets.csv'.format(topic_ind, topic_name))
            
            with open(output_file, 'w') as f:
                print('\tWriting topic_tweets doc ...')
                fieldnames = ('tweet_id', 'tweet_collected_time', 'tweet_text', 'news_native_id')
                writer = csv.DictWriter(f, fieldnames=fieldnames, dialect='topics_docs_line')
                writer.writeheader()
                
                '''
                Split queries into batches
                '''
                if len(tweets_ids_lst) % batch_size:
                    batch_num = len(tweets_ids_lst) // batch_size + 1
                else:
                    batch_num = len(tweets_ids_lst) // batch_size
                    
                for batch_ind in range(0, batch_num):
                    start_ind = batch_ind * batch_size
                    end_ind = start_ind + batch_size
                    
                    # build tuple argument containing tweet_native_ids in this batch
                    batch_tweets_ids_tpl = tuple(tweets_ids_lst[start_ind: end_ind])
                    
                    query_tweet = '''
                    select tweet_id, tweet_collected_time, tweet_text, news_native_id from tweets
                    where tweet_id in ({place_holder});'''.format(place_holder=','.join(['?']*len(batch_tweets_ids_tpl)))

                    cursor.execute(query_tweet, batch_tweets_ids_tpl)
                    for row in cursor.fetchall():
                        writer.writerow({'tweet_id': row['tweet_id'],
                                         'tweet_collected_time': row['tweet_collected_time'],
                                         'tweet_text': row['tweet_text'].replace('\n',' '), # keep each tweet in one line
                                         'news_native_id': row['news_native_id']})


0 Topic_name: Hillary_Clinton_email_controversy; news_num: 228; tweets_num: 860564
	Writing topic_news doc ...
	Writing topic_tweets doc ...
1 Topic_name: Iran_nuclear_deal; news_num: 406; tweets_num: 2412264
	Writing topic_news doc ...
	Writing topic_tweets doc ...
2 Topic_name: ISIS_Jihadi_John_identity_reveal; news_num: 101; tweets_num: 620121
	Writing topic_news doc ...
	Writing topic_tweets doc ...
3 Topic_name: Ukraine_cease_fire; news_num: 84; tweets_num: 603709
	Writing topic_news doc ...
	Writing topic_tweets doc ...
4 Topic_name: Egypt_free_Al_Jazeera_journalist; news_num: 50; tweets_num: 129120
	Writing topic_news doc ...
	Writing topic_tweets doc ...
5 Topic_name: Keystone_XL_Pipeline_bill; news_num: 55; tweets_num: 117692
	Writing topic_news doc ...
	Writing topic_tweets doc ...
6 Topic_name: CIA_Torture_Report; news_num: 41; tweets_num: 167362
	Writing topic_news doc ...
	Writing topic_tweets doc ...
7 Topic_name: Obama_cybersecurity_plan; news_num: 73; tweets_num: 495576
	Writing topic_news doc ...
	Writing topic_tweets doc ...
8 Topic_name: DHS_funding_issue; news_num: 45; tweets_num: 104911
	Writing topic_news doc ...
	Writing topic_tweets doc ...
9 Topic_name: US_Cuba_relationship; news_num: 235; tweets_num: 1213314
	Writing topic_news doc ...
	Writing topic_tweets doc ...
10 Topic_name: 2015_CPAC; news_num: 68; tweets_num: 289774
	Writing topic_news doc ...
	Writing topic_tweets doc ...
11 Topic_name: Iraq_free_ISIS_Tikrit; news_num: 94; tweets_num: 567544
	Writing topic_news doc ...
	Writing topic_tweets doc ...
12 Topic_name: Nigeria_Boko_Haram_terrorists; news_num: 243; tweets_num: 954810
	Writing topic_news doc ...
	Writing topic_tweets doc ...
13 Topic_name: Ferguson_unrest; news_num: 611; tweets_num: 3406303
	Writing topic_news doc ...
	Writing topic_tweets doc ...
14 Topic_name: Hong_Kong_protest; news_num: 157; tweets_num: 559130
	Writing topic_news doc ...
	Writing topic_tweets doc ...
15 Topic_name: Sony_cyberattack; news_num: 275; tweets_num: 1890966
	Writing topic_news doc ...
	Writing topic_tweets doc ...
16 Topic_name: Bill_Cosby_sexual_assault_allegation; news_num: 168; tweets_num: 636409
	Writing topic_news doc ...
	Writing topic_tweets doc ...
17 Topic_name: SpaceX_rocket_landing; news_num: 86; tweets_num: 367488
	Writing topic_news doc ...
	Writing topic_tweets doc ...
18 Topic_name: Brian_Williams_fake_story; news_num: 69; tweets_num: 475319
	Writing topic_news doc ...
	Writing topic_tweets doc ...
19 Topic_name: HSBC_tax_scandal; news_num: 28; tweets_num: 78053
	Writing topic_news doc ...
	Writing topic_tweets doc ...
20 Topic_name: David_Carr_death; news_num: 36; tweets_num: 133630
	Writing topic_news doc ...
	Writing topic_tweets doc ...
21 Topic_name: Patriots_Deflategate; news_num: 44; tweets_num: 268590
	Writing topic_news doc ...
	Writing topic_tweets doc ...
22 Topic_name: Delhi_Uber_driver_rape; news_num: 36; tweets_num: 381997
	Writing topic_news doc ...
	Writing topic_tweets doc ...
23 Topic_name: Superbug_spread; news_num: 41; tweets_num: 240141
	Writing topic_news doc ...
	Writing topic_tweets doc ...
24 Topic_name: Rudy_Giuliani_Obama_critique; news_num: 50; tweets_num: 284510
	Writing topic_news doc ...
	Writing topic_tweets doc ...
25 Topic_name: Oscar; news_num: 241; tweets_num: 1902567
	Writing topic_news doc ...
	Writing topic_tweets doc ...
26 Topic_name: Super_Bowl; news_num: 211; tweets_num: 1628441
	Writing topic_news doc ...
	Writing topic_tweets doc ...
27 Topic_name: Grammy; news_num: 99; tweets_num: 674595
	Writing topic_news doc ...
	Writing topic_tweets doc ...
28 Topic_name: Golden_Globe; news_num: 79; tweets_num: 826707
	Writing topic_news doc ...
	Writing topic_tweets doc ...
29 Topic_name: 500_million_Powerball; news_num: 79; tweets_num: 515778
	Writing topic_news doc ...
	Writing topic_tweets doc ...
30 Topic_name: Thanksgiving; news_num: 150; tweets_num: 1946168
	Writing topic_news doc ...
	Writing topic_tweets doc ...
31 Topic_name: Black_Friday_and_Cyber_Monday; news_num: 121; tweets_num: 1511633
	Writing topic_news doc ...
	Writing topic_tweets doc ...
32 Topic_name: Christmas; news_num: 237; tweets_num: 2628846
	Writing topic_news doc ...
	Writing topic_tweets doc ...
33 Topic_name: New_Year; news_num: 69; tweets_num: 1053027
	Writing topic_news doc ...
	Writing topic_tweets doc ...
34 Topic_name: Apple_Watch; news_num: 73; tweets_num: 608351
	Writing topic_news doc ...
	Writing topic_tweets doc ...
35 Topic_name: Yosemite_historic_climb; news_num: 41; tweets_num: 107481
	Writing topic_news doc ...
	Writing topic_tweets doc ...
36 Topic_name: Jon_Stewart_Daily_Show; news_num: 35; tweets_num: 246313
	Writing topic_news doc ...
	Writing topic_tweets doc ...
37 Topic_name: success_of_American_Sniper; news_num: 155; tweets_num: 740376
	Writing topic_news doc ...
	Writing topic_tweets doc ...
38 Topic_name: Ebola_virus_spread; news_num: 173; tweets_num: 888396
	Writing topic_news doc ...
	Writing topic_tweets doc ...
39 Topic_name: Indonesia_AirAsia_Flight_QZ8501_crash; news_num: 258; tweets_num: 1213037
	Writing topic_news doc ...
	Writing topic_tweets doc ...
40 Topic_name: Paris_attacks; news_num: 225; tweets_num: 1260586
	Writing topic_news doc ...
	Writing topic_tweets doc ...
41 Topic_name: Vanuatu_Cyclone_Pam; news_num: 89; tweets_num: 409029
	Writing topic_news doc ...
	Writing topic_tweets doc ...
42 Topic_name: Malaysia_Airlines_Flight_MH370_crash; news_num: 58; tweets_num: 529421
	Writing topic_news doc ...
	Writing topic_tweets doc ...
43 Topic_name: Colorado_NAACP_bombing; news_num: 38; tweets_num: 345563
	Writing topic_news doc ...
	Writing topic_tweets doc ...
44 Topic_name: FSU_shooting; news_num: 39; tweets_num: 171822
	Writing topic_news doc ...
	Writing topic_tweets doc ...
45 Topic_name: Chapel_Hill_shooting; news_num: 37; tweets_num: 89304
	Writing topic_news doc ...
	Writing topic_tweets doc ...
46 Topic_name: Bobbi_Kristina_Brown_death; news_num: 49; tweets_num: 129493
	Writing topic_news doc ...
	Writing topic_tweets doc ...
47 Topic_name: Taliban_Pakistan_school_massacre; news_num: 80; tweets_num: 286289
	Writing topic_news doc ...
	Writing topic_tweets doc ...
48 Topic_name: American_ISIS_Hostage_Kayla_Mueller; news_num: 38; tweets_num: 74058
	Writing topic_news doc ...
	Writing topic_tweets doc ...
49 Topic_name: TransAsia_Airways_Flight_GE235_crash; news_num: 56; tweets_num: 220522
	Writing topic_news doc ...
	Writing topic_tweets doc ...
50 Topic_name: Germanwings_Flight_9525_crash; news_num: 71; tweets_num: 341486
	Writing topic_news doc ...
	Writing topic_tweets doc ...
CPU times: user 13min 58s, sys: 34.4 s, total: 14min 32s
Wall time: 16min 21s

Re-build topic_tweets docs. Remove tweets with the same or silimar tweet_text field


In [2]:
%%time
"""
Re-build topic_tweets docs. Remove tweets with the same or silimar tweet_text field

For each topic
 - read in original topic_tweets doc
 - remove tweets with the same or silimar tweet_text field
 - write filtered tweets into updated topic_tweets doc
 
All docs are placed inside config.TOPICS_DOCS_DIR
 - updated topic_tweets doc follows name convention: [topic_ind]-[topic_name].updated.tweets.csv
"""

if 0 == 1:
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        localtime = time.asctime(time.localtime(time.time()))
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                           len(config.MANUALLY_SELECTED_TOPICS_LST),
                                                           topic['name'],
                                                           localtime))
        filtered_tweets_lst = []
        
        # set for unique tweet_text field (after remvoing all URL and hashtag entities)
        unique_tweet_text_set = set()
        
        '''
        Read in original topic_tweets doc
        '''
        csv.register_dialect('topics_docs_line', delimiter='\t', doublequote=True, quoting=csv.QUOTE_ALL)
        topic_tweets_csv_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.tweets.csv'.format(topic_ind, topic['name']))
        with open(topic_tweets_csv_file, 'r') as f:
            reader = csv.DictReader(f, dialect='topics_docs_line')
            
            '''
            Remove tweets with the same or silimar tweet_text field
            '''
            # lazy load
            for row in reader:
                tweet_text = row['tweet_text']
                
                cleaned_tweet_text = utilities.clean_tweet_text(tweet_text)
                
                if cleaned_tweet_text not in unique_tweet_text_set:
                    filtered_tweets_lst.append(row)
                    unique_tweet_text_set.add(cleaned_tweet_text)
        
        """
        Write filtered tweets into updated topic_tweets doc
        """
        output_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.updated.tweets.csv'.format(topic_ind, topic['name']))
        with open(output_file, 'w') as f:
            print('\tWriting updated topic_tweets doc ...')
            fieldnames = ('tweet_id', 'tweet_collected_time', 'tweet_text', 'news_native_id')
            writer = csv.DictWriter(f, fieldnames=fieldnames, dialect='topics_docs_line')
            writer.writeheader()

            for tweet in filtered_tweets_lst:
                writer.writerow({'tweet_id': tweet['tweet_id'],
                                 'tweet_collected_time': tweet['tweet_collected_time'],
                                 'tweet_text': tweet['tweet_text'].replace('\n',' '), # keep each tweet in one line
                                 'news_native_id': tweet['news_native_id']})


(1/51) processing topic: Hillary_Clinton_email_controversy ... Mon Oct 23 15:27:39 2017
	Writing updated topic_tweets doc ...
(2/51) processing topic: Iran_nuclear_deal ... Mon Oct 23 15:28:14 2017
	Writing updated topic_tweets doc ...
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Mon Oct 23 15:29:59 2017
	Writing updated topic_tweets doc ...
(4/51) processing topic: Ukraine_cease_fire ... Mon Oct 23 15:30:26 2017
	Writing updated topic_tweets doc ...
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Mon Oct 23 15:30:50 2017
	Writing updated topic_tweets doc ...
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Mon Oct 23 15:30:55 2017
	Writing updated topic_tweets doc ...
(7/51) processing topic: CIA_Torture_Report ... Mon Oct 23 15:31:00 2017
	Writing updated topic_tweets doc ...
(8/51) processing topic: Obama_cybersecurity_plan ... Mon Oct 23 15:31:07 2017
	Writing updated topic_tweets doc ...
(9/51) processing topic: DHS_funding_issue ... Mon Oct 23 15:31:26 2017
	Writing updated topic_tweets doc ...
(10/51) processing topic: US_Cuba_relationship ... Mon Oct 23 15:31:31 2017
	Writing updated topic_tweets doc ...
(11/51) processing topic: 2015_CPAC ... Mon Oct 23 15:32:26 2017
	Writing updated topic_tweets doc ...
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Mon Oct 23 15:32:38 2017
	Writing updated topic_tweets doc ...
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Mon Oct 23 15:33:01 2017
	Writing updated topic_tweets doc ...
(14/51) processing topic: Ferguson_unrest ... Mon Oct 23 15:33:35 2017
	Writing updated topic_tweets doc ...
(15/51) processing topic: Hong_Kong_protest ... Mon Oct 23 15:35:59 2017
	Writing updated topic_tweets doc ...
(16/51) processing topic: Sony_cyberattack ... Mon Oct 23 15:36:20 2017
	Writing updated topic_tweets doc ...
(17/51) processing topic: Bill_Cosby_sexual_assault_allegation ... Mon Oct 23 15:37:39 2017
	Writing updated topic_tweets doc ...
(18/51) processing topic: SpaceX_rocket_landing ... Mon Oct 23 15:38:05 2017
	Writing updated topic_tweets doc ...
(19/51) processing topic: Brian_Williams_fake_story ... Mon Oct 23 15:38:19 2017
	Writing updated topic_tweets doc ...
(20/51) processing topic: HSBC_tax_scandal ... Mon Oct 23 15:38:37 2017
	Writing updated topic_tweets doc ...
(21/51) processing topic: David_Carr_death ... Mon Oct 23 15:38:40 2017
	Writing updated topic_tweets doc ...
(22/51) processing topic: Patriots_Deflategate ... Mon Oct 23 15:38:45 2017
	Writing updated topic_tweets doc ...
(23/51) processing topic: Delhi_Uber_driver_rape ... Mon Oct 23 15:38:57 2017
	Writing updated topic_tweets doc ...
(24/51) processing topic: Superbug_spread ... Mon Oct 23 15:39:13 2017
	Writing updated topic_tweets doc ...
(25/51) processing topic: Rudy_Giuliani_Obama_critique ... Mon Oct 23 15:39:24 2017
	Writing updated topic_tweets doc ...
(26/51) processing topic: Oscar ... Mon Oct 23 15:39:37 2017
	Writing updated topic_tweets doc ...
(27/51) processing topic: Super_Bowl ... Mon Oct 23 15:40:57 2017
	Writing updated topic_tweets doc ...
(28/51) processing topic: Grammy ... Mon Oct 23 15:42:08 2017
	Writing updated topic_tweets doc ...
(29/51) processing topic: Golden_Globe ... Mon Oct 23 15:42:38 2017
	Writing updated topic_tweets doc ...
(30/51) processing topic: 500_million_Powerball ... Mon Oct 23 15:43:12 2017
	Writing updated topic_tweets doc ...
(31/51) processing topic: Thanksgiving ... Mon Oct 23 15:43:33 2017
	Writing updated topic_tweets doc ...
(32/51) processing topic: Black_Friday_and_Cyber_Monday ... Mon Oct 23 15:45:02 2017
	Writing updated topic_tweets doc ...
(33/51) processing topic: Christmas ... Mon Oct 23 15:46:11 2017
	Writing updated topic_tweets doc ...
(34/51) processing topic: New_Year ... Mon Oct 23 15:48:24 2017
	Writing updated topic_tweets doc ...
(35/51) processing topic: Apple_Watch ... Mon Oct 23 15:49:15 2017
	Writing updated topic_tweets doc ...
(36/51) processing topic: Yosemite_historic_climb ... Mon Oct 23 15:49:40 2017
	Writing updated topic_tweets doc ...
(37/51) processing topic: Jon_Stewart_Daily_Show ... Mon Oct 23 15:49:44 2017
	Writing updated topic_tweets doc ...
(38/51) processing topic: success_of_American_Sniper ... Mon Oct 23 15:49:53 2017
	Writing updated topic_tweets doc ...
(39/51) processing topic: Ebola_virus_spread ... Mon Oct 23 15:50:24 2017
	Writing updated topic_tweets doc ...
(40/51) processing topic: Indonesia_AirAsia_Flight_QZ8501_crash ... Mon Oct 23 15:51:01 2017
	Writing updated topic_tweets doc ...
(41/51) processing topic: Paris_attacks ... Mon Oct 23 15:51:49 2017
	Writing updated topic_tweets doc ...
(42/51) processing topic: Vanuatu_Cyclone_Pam ... Mon Oct 23 15:52:44 2017
	Writing updated topic_tweets doc ...
(43/51) processing topic: Malaysia_Airlines_Flight_MH370_crash ... Mon Oct 23 15:52:59 2017
	Writing updated topic_tweets doc ...
(44/51) processing topic: Colorado_NAACP_bombing ... Mon Oct 23 15:53:20 2017
	Writing updated topic_tweets doc ...
(45/51) processing topic: FSU_shooting ... Mon Oct 23 15:53:34 2017
	Writing updated topic_tweets doc ...
(46/51) processing topic: Chapel_Hill_shooting ... Mon Oct 23 15:53:41 2017
	Writing updated topic_tweets doc ...
(47/51) processing topic: Bobbi_Kristina_Brown_death ... Mon Oct 23 15:53:45 2017
	Writing updated topic_tweets doc ...
(48/51) processing topic: Taliban_Pakistan_school_massacre ... Mon Oct 23 15:53:50 2017
	Writing updated topic_tweets doc ...
(49/51) processing topic: American_ISIS_Hostage_Kayla_Mueller ... Mon Oct 23 15:54:01 2017
	Writing updated topic_tweets doc ...
(50/51) processing topic: TransAsia_Airways_Flight_GE235_crash ... Mon Oct 23 15:54:04 2017
	Writing updated topic_tweets doc ...
(51/51) processing topic: Germanwings_Flight_9525_crash ... Mon Oct 23 15:54:12 2017
	Writing updated topic_tweets doc ...
CPU times: user 26min 1s, sys: 36.6 s, total: 26min 37s
Wall time: 26min 48s