In [1]:
#!/usr/bin/env python

#Author: Mihai Avram, e-mail: mihai.v.avram@gmail.com

#ALL IMPORTS
#for parsing the data in the logs
import json
#for connecting to the database
import psycopg2
#for error logging
import sys, traceback
#for timing purposes
import time

#ALL FUNCTIONS
#function for deciding on a score value to use for bot_score_english and bot_score_universal depending on what's available in the log
def score_decider(potential_score_keys, line_json):
    """Return the first value reachable through one of several candidate key paths.

    Parameters:
        potential_score_keys: key paths in priority order. Each path is a sequence of
            keys/indexes applied one after another, e.g. ["score", "english"] reads
            line_json["score"]["english"]. Paths may be of any non-zero length; empty
            paths are skipped.
        line_json: the parsed log entry the paths are looked up in.

    Returns:
        The value stored at the first path that resolves (even when that value is None),
        or None when no path resolves.
    """
    for key_path in potential_score_keys:
        if len(key_path) == 0:
            continue

        node = line_json
        try:
            for key in key_path:
                node = node[key]
        except (KeyError, IndexError, TypeError):
            #missing key, index out of range, or an intermediate value that cannot be indexed
            #(a number, None, a list asked for a string key): fall through to the next candidate.
            #only lookup failures are caught so that Ctrl-C still interrupts the import
            continue
        return node

    return None

#inserts log to database
def log_insertion_script(user_id, screen_name, time_stamp, all_bot_scores, bot_score_english, \
                bot_score_universal, requester_ip, tweets_per_day, num_submitted_timeline_tweets, \
                num_submitted_mention_tweets, num_requests):
    """Insert one record into public.botscore and commit it immediately.

    Uses the module-level `botbase_cursor` and `pgsqlconn`, which must already be open.
    `time_stamp` is handed to PostgreSQL's to_timestamp() and `all_bot_scores` is serialized
    with json.dumps (so None is sent as the JSON text "null"); every other argument is bound
    to its column unchanged.

    Raises:
        Whatever the serialization, execute or commit raised. Before re-raising, the
        transaction is rolled back: a failed statement otherwise leaves the connection's
        transaction aborted, and every later insert on the same connection would fail too.
    """
    try:
        botbase_cursor.execute("""INSERT INTO public.botscore(
                user_id, screen_name, time_stamp, all_bot_scores, bot_score_english, 
                bot_score_universal, requester_ip, tweets_per_day, num_submitted_timeline_tweets, 
                num_submitted_mention_tweets, num_requests) 
                              VALUES 
                (%s, %s, to_timestamp(%s), %s, %s, %s, %s, %s, %s, %s, %s);""", \
                (user_id, screen_name, time_stamp, json.dumps(all_bot_scores), bot_score_english, \
                bot_score_universal, requester_ip, tweets_per_day, num_submitted_timeline_tweets, \
                num_submitted_mention_tweets, num_requests))

        #committing changes
        pgsqlconn.commit()
    except Exception:
        #discard the failed transaction so the next record starts from a clean state
        pgsqlconn.rollback()
        raise

#GLOBAL VARIABLES
#running tallies for the summary printed once the import finishes; all of them start at zero
#lines read across every log file, and records successfully written to the database
total_number_of_lines_parsed = records_committed = 0
#lines skipped before any field is read: not JSON, JSON whose "type" is not "log", JSON without a usable "type"
errors_and_informational_count = json_not_proper_log_count = json_with_no_type_count = 0
#records whose "categories" matched none of the known score layouts (they are still inserted, without category scores)
unmatched_botscore_category_schema_count = 0
#records dropped because a required field could not be read, and records whose database insert failed
failed_to_retrieve_proper_fields_count = failed_to_commit_to_db_count = 0
    
#MAIN CODE
#order in which the six category scores are read and stored in all_bot_scores
_CATEGORY_NAMES = ("friend", "sentiment", "temporal", "user", "network", "content")
#position of each category's [name, value] pair in the list-style "categories" layout
_LIST_SCHEMA_POSITIONS = {"friend": 5, "sentiment": 1, "temporal": 2, "user": 4, "network": 0, "content": 3}
#numeric timestamps at or above this value are read as milliseconds (i.e. every integer of 12 or more digits)
_MILLISECOND_TIMESTAMP_THRESHOLD = 10 ** 11


def _parse_category_scores(categories):
    """Build the {"friend": ..., ..., "content": ...} dict of scores, each rounded to 2 decimals.

    Three layouts of the log's "categories" value are tried in order:
      1. a mapping keyed "<name>_classification"
      2. a mapping keyed "<name>" (the layout of the mashape botscore api,
         https://market.mashape.com/OSoMe/botometer)
      3. a list of [name, value] pairs, i.e. [["network",0.5], ["sentiment",0.2], ...]

    Raises whatever the third attempt raised when none of the layouts fits.
    """
    try:
        return {name: round(categories[name + "_classification"], 2) for name in _CATEGORY_NAMES}
    except Exception:
        pass
    try:
        return {name: round(categories[name], 2) for name in _CATEGORY_NAMES}
    except Exception:
        pass
    #NOTE(review): the pairs are read by fixed position; the name inside each pair is not checked
    return {name: round(categories[_LIST_SCHEMA_POSITIONS[name]][1], 2) for name in _CATEGORY_NAMES}


def _normalize_timestamp(time_stamp):
    """Return time_stamp in seconds, scaling down values that were logged in milliseconds.

    Numbers are judged by magnitude rather than by the length of their text form, so a
    second-resolution float such as 1504396800.25 is no longer mistaken for milliseconds;
    for non-negative integers the outcome is the same as the former 12-character rule.
    Any other value keeps the text-length rule, so a non-numeric value of 12 or more
    characters still raises TypeError on the division and the caller rejects the record.
    """
    if isinstance(time_stamp, (int, float)):
        if time_stamp >= _MILLISECOND_TIMESTAMP_THRESHOLD:
            return time_stamp / 1000
        return time_stamp
    if len(str(time_stamp)) >= 12:
        return time_stamp / 1000
    return time_stamp


def _error_log_entry(label, log, line_num, error):
    """Format one error-log line: the problem label, the log file, the line number and the exception type."""
    return label + "---File: " + log + " LineNumber: " + str(line_num) + " Error: " + str(type(error)) + "\n"


#MAIN CODE
#reads every configured log file line by line, turns each JSON line of type "log" into one
#botscore record, inserts it into the database and prints a summary of what was imported and skipped
if __name__ == '__main__':
    #connecting to the database
    #NOTE(review): the credentials are hard-coded; consider loading them from the environment or a config file
    pgsqlconn = psycopg2.connect(host='localhost', user='postgres', password='password', dbname='botbase')
    #cursor needed to execute db operations
    botbase_cursor = pgsqlconn.cursor()
    #starting timer
    timer_start = time.time()

    #log name and location information
    #NOTE(review): absolute, machine-specific path; adjust before running elsewhere
    #log_path = '/home/mavram/Research/HoaxyBotometer/ImportBackuplogsTask/logs/backups/unzipstage/'
    log_path = '/media/marvram/OS/Research/HoaxyBotometer/ImportBackuplogsTask/logs/'
    log_file_list = ['botornot.log.2017-09-03','botornot.log.2017-09-10']
                    #'botornot.log201506',
                     #, 'botornot.log201510', 'botornot.log201605', 'botornot.log201701', \
                     #'botornot.log201702', 'botornot.log201705', 'botornot.log.2017-05-14', 'botornot.log.2017-05-21', \
                     #'botornot.log.2017-05-28', 'botornot.log.2017-06-04', 'botornot.log.2017-06-11', 'botornot.log.2017-06-18', \
                     #'botornot.log.2017-06-25', 'botornot.log.2017-07-02', 'botornot.log.2017-07-09', 'botornot.log.2017-07-16', \
                     #'botornot.log.2017-07-23', 'botornot.log.2017-07-30', 'botornot.log.2017-08-06', 'botornot.log.2017-08-13']
                    #recent
                    #botornot.log.2017-08-20  botornot.log.2017-08-27

    #english bot score which is either found in line_json["score"]["english"], line_json["score"] or line_json["classification"]
    english_score_keys = [["score","english"],["score"],["classification"]]
    #universal bot score which is either found in line_json["score"]["universal"] or line_json["categories"]["languageagnostic_classification"] otherwise null
    universal_score_keys = [["score","universal"],["categories","languageagnostic_classification"]]

    #columns that are not read from the logs; every imported record gets these fixed values
    tweets_per_day = None
    num_submitted_timeline_tweets = None
    num_submitted_mention_tweets = None
    num_requests = 0

    try:
        #log to store any errors due to the logs not containing the proper data (i.e. other logging information such as errors or other requests)
        with open("botscoreloginsertion.err", "a") as error_log_file:
            #iterating through all log files
            for log in log_file_list:
                print("Starting to import log: ", log)
                sys.stdout.flush()
                file_location = log_path + log

                #parsing logs and uploading the entries to the botometer database;
                #the with-block closes each log file as soon as it has been read
                with open(file_location, "r") as log_file:
                    for line_num, line in enumerate(log_file, start=1):
                        total_number_of_lines_parsed += 1

                        #checking if the current line is json, if not then this line should not be parsed because we are only looking for json log lines
                        try:
                            line_json = json.loads(line)
                        except Exception:
                            errors_and_informational_count += 1
                            continue

                        #only json entries whose "type" is "log" describe a bot score request
                        try:
                            if line_json["type"] != "log":
                                json_not_proper_log_count += 1
                                continue
                        except Exception as error:
                            json_with_no_type_count += 1
                            error_log_file.write(_error_log_entry("NO-LOG-TYPE-JSON INFO", log, line_num, error))
                            continue

                        #parsing json line and retrieving the proper fields regarding the user i.e. user id, screen name, tweets, etc...
                        try:
                            user_id = line_json["search"]["user_id"]
                            raw_screen_name = line_json["search"]["sn"]
                            #a json null stays a database null instead of becoming the text "None"
                            screen_name = None if raw_screen_name is None else str(raw_screen_name)
                            if screen_name is not None and len(screen_name) > 15:
                                #user may have a screen-name logged as longer than 15 characters which is not proper in Twitter and could be instead the userid or some other error so we make it none
                                screen_name = None
                            #some timestamps are stored in milliseconds so those are converted to seconds
                            time_stamp = _normalize_timestamp(line_json["timestamp"])
                            botscore_representation = line_json["categories"]

                            #parsing the bot scores into one common schema
                            try:
                                all_bot_scores = _parse_category_scores(botscore_representation)
                            except Exception as error:
                                #score schema does not include the needed scores so will insert as null
                                unmatched_botscore_category_schema_count += 1
                                error_log_file.write(_error_log_entry("NON-MATCHED-CATEGORY-SCHEMA INFO", log, line_num, error))
                                all_bot_scores = None

                            bot_score_english = score_decider(english_score_keys, line_json)
                            bot_score_universal = score_decider(universal_score_keys, line_json)

                            #storing a comma delimited string of ips
                            #some ips are stored in lists and some not, must distinguish and treat them separately here
                            #in order to yield <ip1>,<ip2>,etc...
                            requester_ip = line_json["remote_ip"]
                            if isinstance(requester_ip, list):
                                requester_ip = ','.join(requester_ip)
                        except Exception as error:
                            error_log_file.write(_error_log_entry("NON-PROPER-FIELDS ERROR", log, line_num, error))
                            failed_to_retrieve_proper_fields_count += 1
                            continue

                        try:
                            #inserting data to the database
                            log_insertion_script(user_id, screen_name, time_stamp, all_bot_scores, bot_score_english,
                                                 bot_score_universal, str(requester_ip), tweets_per_day,
                                                 num_submitted_timeline_tweets, num_submitted_mention_tweets, num_requests)
                            records_committed += 1
                        except Exception as error:
                            error_log_file.write(_error_log_entry("DB INSERTION ERROR", log, line_num, error))
                            failed_to_commit_to_db_count += 1
                            continue

                print("Finished importing log: ", log)
                sys.stdout.flush()
    finally:
        #closing access to database, also when the import stops early because of an error
        botbase_cursor.close()
        pgsqlconn.close()

    #ending and evaluating time elapsed
    print("%s seconds elapsed" % (time.time()-timer_start))
    print("Log Import Process Completed!")

    #printing log statistics
    print("LOG IMPORT PROCESS INFORMATION:")
    print("total-lines-parsed: ", total_number_of_lines_parsed)
    print("records-committed: ", records_committed)
    print("non-json-lines: ",errors_and_informational_count)
    print("non-log-json-type: ", json_not_proper_log_count)
    print("json-with-no-type: ", json_with_no_type_count)
    print("non-matched-proper-score-category-schema: ", unmatched_botscore_category_schema_count)
    print("non-proper-fields-upon-retrieval: ", failed_to_retrieve_proper_fields_count)
    print("db-commit-failures: ", failed_to_commit_to_db_count)


Starting to import log:  test
Finished importing log:  test
0.34769105911254883 seconds elapsed
Log Import Process Completed!
LOG IMPORT PROCESS INFORMATION:
total-lines-parsed:  1
records-committed:  1
non-json-lines:  0
non-log-json-type:  0
json-with-no-type:  0
non-matched-proper-score-category-schema:  0
non-proper-fields-upon-retrieval:  0
db-commit-failures:  0

In [ ]: