In [1]:

    
from pymongo import MongoClient
from tqdm import tqdm



In [2]:

    
client = MongoClient()
db = client['rf_test']
col_entries = db['entries']
col_inst = db['inst']
col_inds = db['inds']
col_nouns = db['nouns']

First look at entries

What are those?



In [3]:

    
def list_entries():
    d = col_entries.distinct("type")
    return [x for x in d]

What do we have the most?



In [8]:

    
entries = Out[4]



In [12]:

    
def count_entry():
    entry_count = []
    for entry in entries:
        c = col_entries.find({"type":entry}).count()
        print("{}|{}".format(c,entry))
        entry_count.append((c,entry))
    return entry_count



In [14]:

    
entry_count = Out[13]



In [17]:

    
sorted(entry_count, key= lambda x: x[0])[::-1]









    Out[17]:





[(177928, 'URL'),
 (52303, 'Username'),
 (11210, 'InternetDomainName'),
 (7060, 'Source'),
 (4916, 'GeoEntity'),
 (4661, 'Hashtag'),
 (4238, 'Company'),
 (3700, 'Person'),
 (3307, 'OrgEntity'),
 (2704, 'City'),
 (2616, 'Malware'),
 (2196, 'IndustryTerm'),
 (2049, 'ProvinceOrState'),
 (1917, 'Organization'),
 (1384, 'Product'),
 (923, 'Technology'),
 (861, 'IpAddress'),
 (680, 'Position'),
 (452, 'Facility'),
 (335, 'MalwareSignature'),
 (332, 'FileName'),
 (210, 'Country'),
 (207, 'Region'),
 (126, 'PublishedMedium'),
 (125, 'Hash'),
 (123, 'EmailAddress'),
 (97, 'CyberVulnerability'),
 (66, 'Industry'),
 (48, 'MetaType'),
 (43, 'NaturalFeature'),
 (42, 'AttackVector'),
 (29, 'Feature'),
 (28, 'Topic'),
 (25, 'OperatingSystem'),
 (23, 'Operation'),
 (22, 'TVShow'),
 (18, 'SourceMediaType'),
 (16, 'Religion'),
 (16, 'MedicalCondition'),
 (15, 'EntityList'),
 (15, 'MalwareCategory'),
 (15, 'WinRegKey'),
 (13, 'Holiday'),
 (12, 'Commodity'),
 (12, 'ProgrammingLanguage'),
 (9, 'MarketIndex'),
 (8, 'CyberExploitTargetCategory'),
 (8, 'Sector'),
 (7, 'Continent'),
 (6, 'ASNumber'),
 (2, 'IRCNetwork'),
 (2, 'MedicalTreatment'),
 (1, 'Anniversary'),
 (1, 'User'),
 (1, 'CyberThreatActorCategory'),
 (1, 'TechnologyArea')]

In these sources of data, usernames seem interesting.

Usernames

First, we want to look through what are the usernames we have in the database. To find all usernames, we can do:



In [3]:

    
def find_all_usernames():
    return [x for x in col_entries.find({"type":"Username"})]

Since there are about 50k usernames, let's just look at a random one:



In [4]:

    
find_all_usernames()[30]









    Out[4]:





{'_id': ObjectId('58d9f0e4df3e728bd7e112e7'),
 'created': '2014-07-16T14:44:21.851Z',
 'created_at': '2013-12-09T15:48:56.000Z',
 'curated': 0,
 'domain': 'B_E-R',
 'hits': 40,
 'id': 'LTRvjC',
 'meta_type': 'type:Username',
 'name': '@felicita_andrad',
 'type': 'Username'}

Note the 'id' field. Since this correspond to the author field in the instance, we can use this to count the contributions of the username. Therefore, for each username, we can count or list all of the instance that they are associated with.



In [5]:

    
def find_inst_by_username(userid):
    return [x for x in col_inst.find({"attributes.authors": userid})]



In [6]:

    
find_inst_by_username("LTRvjC")









    Out[6]:





[{'_id': ObjectId('58d9f0b6df3e728bd7de539f'),
  'attributes': {'analyzed': '2015-09-21T17:29:21.094Z',
   'authors': ['LTRvjC'],
   'binning_id': 'GoX8I15DMMc',
   'canonic_id': 'GoX8I15DMMc',
   'document_external_id': '646013299067228160',
   'document_offset': 0,
   'document_url': 'url:https://twitter.com/felicita_andrad/statuses/646013299067228160',
   'entities': ['url:http://www.20minutos.es/noticia/2561290/0/app-store-apple/victima-ciberataque-wechat/xcodeghost/',
    'JDYK14',
    'B_LyO',
    'O8A5tj'],
   'fragment_count': 1,
   'function': 'id',
   'general_negative': 0.0,
   'general_positive': 0.0,
   'indicator': 'ciberataque',
   'meta_type': 'type:CyberAttack',
   'negative': 0.0,
   'positive': 0.0,
   'sentiments': {'activism': 0.0,
    'general_negative': 0.0,
    'general_positive': 0.0,
    'negative': 0.0,
    'positive': 0.0,
    'violence': 0.0},
   'target': ['JDYK14'],
   'target_string': 'EFE Un',
   'topics': ['KPzZAE'],
   'user_data': {'followers_count': 39,
    'friends_count': 36,
    'statuses_count': 1375},
   'violence': 0.0},
  'cluster_id': 'BG-xCBk6KSk',
  'cluster_ids': ['BG-xCBk6KSk'],
  'document': {'downloaded': '2015-09-21T17:29:20.602Z',
   'id': 'O9QHDL',
   'indexed': '2015-09-21T17:29:21.272Z',
   'language': 'spa',
   'published': '2015-09-21T17:29:07.000Z',
   'sourceId': {'country': 'United States',
    'description': 'Twitter',
    'id': 'BV5',
    'media_type': 'JxSEtC',
    'name': 'Twitter'},
   'title': 'App Store de Apple sufre su primer gran ciberataque: EFE Un software infectado, denominado XcodeGhost, ha afec... http://t.co/RBN25ohpqC',
   'url': 'https://twitter.com/felicita_andrad/statuses/646013299067228160'},
  'fragment': 'App Store de Apple sufre su primer gran ciberataque: EFE Un software infectado, denominado XcodeGhost, ha afec... http://t.co/RBN25ohpqC.',
  'id': 'GVwn7jAnIpd',
  'item_fragment': 'ciberataque: EFE Un',
  'precision': 'ms',
  'start': '2015-09-21T17:29:07.000Z',
  'stop': '2015-09-21T17:29:07.000Z',
  'tagged_fragment': 'App Store de <e id=B_LyO>Apple</e> sufre su primer gran <i id=GVwn7jAnIpd>ciberataque: <e id=JDYK14>EFE Un</e></i> software infectado, denominado <e id=O8A5tj>XcodeGhost</e>, ha afec... <e id=url:http://www.20minutos.es/noticia/2561290/0/app-store-apple/victima-ciberataque-wechat/xcodeghost/>http://t.co/RBN25ohpqC</e>.',
  'time_type': 'in',
  'type': 'CyberAttack'}]

We could also go through the data and see which username "have the most to say".



In [7]:

    
def username_ranking():
    usernames_ranks = []
    usernames = find_all_usernames()[:100]
    for u in tqdm(usernames):
        uid = u['id']
        name = u['name']
        count = col_inst.find({"attributes.authors":uid}).count()
        usernames_ranks.append((uid, name, count))
    return sorted(usernames_ranks, key=lambda x: x[2])

On my computer, this process would take around 5 hours to go through the complete dataset and count up all the usernames, we got about 3 username per seconds. So I only did about the first 100 usernames



In [8]:

    
u_ranks = username_ranking()









    



100%|██████████| 100/100 [00:33<00:00,  3.03it/s]

For example, in this small subset, the user with the most instance connected to, is



In [11]:

    
u_ranks[-1]









    Out[11]:





('KKYGPH', '@CKLeetwt', 18)



In [12]:

    
def get_inst_for_user(uid):
    return [x for x in col_inst.find({"attributes.authors":uid})]

What does this user talks about?



In [15]:

    
[x['attributes']['indicator'] for x in get_inst_for_user('KKYGPH')]









    Out[15]:





['Gameover Zeus',
 'SYNful Knock',
 'malware',
 'Superfish',
 'Carberp',
 'Moose malware may infect Linux - based routers',
 'Duqu2',
 'malware',
 'SpyEye',
 'Dyreza',
 'Dridex Botnet',
 'Dridex',
 'Thunderstrike',
 'RIG exploit kit infects 1 million PCs',
 'Stuxnet',
 'Potao malware used to spy on targets in Ukraine, Russia',
 'TorrentLocker',
 'Trojan']



In [16]:

    
from nltk.tag import pos_tag



In [17]:

    
def find_nouns(sentence):
    words = sentence.split()
    tagged = pos_tag(words)
    return [w for w, t in tagged if t == 'NNP']

Here, we will look at everything the user has said, and count all the proper nouns.



In [22]:

    
def words_from_user(uid):
    insts = get_inst_for_user('KKYGPH')
    sentences = [x['fragment'] for x in insts]
    words = {}
    for s in sentences:
        ns = find_nouns(s)
        for n in ns:
            try:
                words[n] += 1
            except KeyError:
                words[n] = 1
    return words



In [26]:

    
sorted(((a,b) for a,b in words_from_user('KKYGPH').items()), key=lambda x: x[1])[::-1]









    Out[26]:





[('Dridex', 3),
 ('|', 3),
 ('Carberp', 2),
 ('Cisco', 2),
 ('RT', 2),
 ('Zeus', 2),
 ('Rt', 2),
 ('#ja….', 1),
 ('#malware', 1),
 ('#cybercrime', 1),
 ('Shifu:', 1),
 ('@icyberfighter:', 1),
 ('#ransomware', 1),
 ('#TorrentLocker', 1),
 ('Zealand)', 1),
 ('ANZ', 1),
 ('Russia', 1),
 ('Ukraine,', 1),
 ('Potao', 1),
 ('Korea', 1),
 ('North', 1),
 ('United', 1),
 ('Stuxnet', 1),
 ('ICYMI:', 1),
 ('RIG', 1),
 ('Symantec', 1),
 ('VB', 1),
 ('Recent', 1),
 ('Czech', 1),
 ('Botnet', 1),
 ('Proofpoint', 1),
 ('Dyreza,', 1),
 ('SophosLabs:', 1),
 ('Europol', 1),
 ('SpyEye', 1),
 ('Aussies', 1),
 ('Cryptolocker', 1),
 ('Australia', 1),
 ('LAST:', 1),
 ('AT', 1),
 ('Foxconn', 1),
 ('Moose', 1),
 ('#Australia', 1),
 ('Hemisphere', 1),
 ('Southern', 1),
 ('Works', 1),
 ('Superfish', 1),
 ("Lenovo's", 1),
 ("Knock'", 1),
 ('@SCMagazineAU:', 1),
 ('Shadowserver', 1),
 ('Knock', 1),
 ('SYNful', 1),
 ('@regsecurity:', 1),
 ('Gameover', 1),
 ('Fbi', 1),
 ('@esecurityp:', 1)]

Uhmm, apparently, this user talks about Dridex more than other. This still needs a lot of improvement. One big improvement this approach can use is to categorize the noun. This will give us some context to what the user is talking about.

Furthermore, we can compare the user and group them by what they are talking about. This come back to our instance database. For example, let's look at an our Dridex malware



In [46]:

    
def people_and_instance(indicator):
    c = col_inst.find({"attributes.indicator": indicator})
    c = [x for x in c]
    authors = []
    authors_info = {}
    for entry in c:
        try:
            author = entry['attributes']['authors']
        except KeyError:
            author = None
        authors.append(author)
    for a in tqdm(authors):
        if a:
            en = col_entries.find_one({"id": a[0]})
            authors_info[en['id']] = {"name": en['name']}
    return authors_info



In [48]:

    
talks_of_dridex = Out[29]



In [32]:

    
len(Out[29]) #There are 468 entries about Dridex, sweet !!









    Out[32]:





468



In [47]:

    
people_and_instance('Dridex')









    



100%|██████████| 468/468 [00:33<00:00, 19.11it/s]






    Out[47]:





{'K05qIa': {'name': '@NiightlyCat'},
 'K17xcT': {'name': '@ScottiAlbertoG'},
 'K5dKsC': {'name': '@submoodle'},
 'K7x5By': {'name': '@quallimited'},
 'K9J08m': {'name': '@jabolins'},
 'K9eq1h': {'name': '@Avosec'},
 'KF3KQS': {'name': '@JA25000'},
 'KF8pTm': {'name': '@ximad'},
 'KF9gR7': {'name': '@Seifreed'},
 'KFK0xQ': {'name': '@bartblaze'},
 'KFK57Q': {'name': '@AdamLangePL'},
 'KFKXsd': {'name': '@UK_Tech_News'},
 'KFKg5Z': {'name': '@InfosecNewsBot'},
 'KFKmZP': {'name': '@Insecurestuff'},
 'KFLYUo': {'name': '@SecurityNews'},
 'KFMDfg': {'name': '@BrianHonan'},
 'KFMJsp': {'name': '@MalwareMustDie'},
 'KFMT7t': {'name': '@upgradeoptions'},
 'KFMk-r': {'name': '@evanderburg'},
 'KFNVB9': {'name': '@wopot'},
 'KFO8a5': {'name': '@ciperovich'},
 'KFPMax': {'name': '@ITDataSecurity'},
 'KFPthV': {'name': '@Cephurs'},
 'KFPuks': {'name': '@c_APT_ure'},
 'KFQ5tn': {'name': '@ioerror'},
 'KFQITr': {'name': '@Security_FAQs'},
 'KFRzvY': {'name': '@lemoine_vincent'},
 'KFSwQj': {'name': '@Tinolle'},
 'KFTlyU': {'name': '@benkow_'},
 'KFTq5q': {'name': '@HackerSpyNet'},
 'KFTwwV': {'name': '@dlemckert'},
 'KFUkY1': {'name': '@virusbtn'},
 'KFVuR_': {'name': '@ChristiaanBeek'},
 'KFWmx7': {'name': '@Cyber_War_News'},
 'KFXuWH': {'name': '@sans_isc'},
 'KFa0WF': {'name': '@EconomicMayhem'},
 'KFaYH_': {'name': '@psautjeau'},
 'KFcAYL': {'name': '@PhysicalDrive0'},
 'KFcHXv': {'name': '@SoLatiK'},
 'KFe6ox': {'name': '@Z9M9Z'},
 'KFfDhi': {'name': '@crmunoz27'},
 'KFfKlU': {'name': '@bhconsulting'},
 'KFgSll': {'name': '@stopbadware'},
 'KFjonn': {'name': '@c0d3xpl0it'},
 'KFkNxG': {'name': '@malekal_morte'},
 'KFkhH3': {'name': '@AlJnErAl'},
 'KFneYb': {'name': '@websense'},
 'KFrEmJ': {'name': '@EGeorgantas'},
 'KFta3y': {'name': '@ianbeckett'},
 'KFwaNl': {'name': '@cyberthrone_war'},
 'KFxpUa': {'name': '@chernobyl1986'},
 'KFydkJ': {'name': '@sidoyle'},
 'KG5QOY': {'name': '@Techhelplistcom'},
 'KG6CtV': {'name': '@BRIGHTZEED'},
 'KG7O9Z': {'name': '@Vircom_Inc'},
 'KGCZQM': {'name': '@Technol_news'},
 'KGJqnS': {'name': '@BelchSpeak'},
 'KGPRzE': {'name': '@christruncer'},
 'KGQUws': {'name': '@MultiNetRo'},
 'KGRhOC': {'name': '@ConradLongmore'},
 'KGStfw': {'name': '@fknsec'},
 'KGStkY': {'name': '@abhie'},
 'KGTDhO': {'name': '@CliveJBN'},
 'KGTZbT': {'name': '@Radarbot'},
 'KGV8ur': {'name': '@mad_gav'},
 'KGXpru': {'name': '@spyd3r'},
 'KGfM2q': {'name': '@ajohn76'},
 'KGhRZl': {'name': '@EnriquePernas'},
 'KGjkRZ': {'name': '@JamesGoz'},
 'KGqEOt': {'name': '@newsfeit'},
 'KGqKtH': {'name': '@Radio_Powermix'},
 'KGrvzE': {'name': '@m_spreitz'},
 'KGum-e': {'name': '@twilleer'},
 'KGy2Md': {'name': '@codinguy'},
 'KHDUOq': {'name': '@simoncrosby'},
 'KHFpAg': {'name': '@SCmagazineUK'},
 'KH_l08': {'name': '@kjetildahlseng'},
 'KHpMJm': {'name': '@aelsmartin'},
 'KHz94M': {'name': '@chrisdoman'},
 'KI5zNO': {'name': '@angelor'},
 'KIPvac': {'name': '@0xAli'},
 'KIXV5k': {'name': '@Paola_marketing'},
 'KIc-j0': {'name': '@abhinavbom'},
 'KIfKPD': {'name': '@jpalanco'},
 'KJ3IUW': {'name': '@websenselabs'},
 'KJH6g0': {'name': '@senadaruch'},
 'KJMTBN': {'name': '@GJvManen'},
 'KJMWUg': {'name': '@bluejay00'},
 'KJMq-h': {'name': '@ArgentConsultin'},
 'KJdSOm': {'name': '@joepie91'},
 'KK9u6r': {'name': '@DoctorNoFI'},
 'KKYGPH': {'name': '@CKLeetwt'},
 'KKwlx6': {'name': '@siri_urz'},
 'KLS_k5': {'name': '@Nightwolf42'},
 'KMZZeO': {'name': '@creditcardslab'},
 'KN-Hos': {'name': '@shellprompt'},
 'KN7_Oy': {'name': '@briskinfosec'},
 'KNXelC': {'name': '@NC3mobi'},
 'KNYFs1': {'name': '@Mirakshin'},
 'KNrshW': {'name': '@nanderoo'},
 'KO2_rX': {'name': '@cremvn'},
 'KOB1ni': {'name': '@DellSecurity'},
 'KOV6s8': {'name': '@insecurechile'},
 'KOdryw': {'name': '@korezian'},
 'KPCgiG': {'name': '@_jussil_'},
 'KPqkp8': {'name': '@CIGTR'},
 'KQmw0h': {'name': '@acoutal'},
 'KREFax': {'name': '@SteveNixonIP'},
 'KTMICm': {'name': '@FelixEhlers'},
 'KTuH6e': {'name': '@ProfWoodward'},
 'KUFPkp': {'name': '@superstubbs'},
 'KVmLAG': {'name': '@davehull'},
 'KW4-8h': {'name': '@malwaregroup'},
 'KW6Z9b': {'name': '@rodrigovigna'},
 'KWyse8': {'name': '@HuEY_KhrySTAL'},
 'KYedkq': {'name': '@SecurityForest'},
 'KZ5vmj': {'name': '@colinmahns'},
 'KZGTS0': {'name': '@Tears0fSky'},
 'KZWq_0': {'name': 'Malekal_morte'},
 'K_CeAA': {'name': '@fedelemantuano'},
 'K_NzfV': {'name': '@Certego_IRT'},
 'K_OQUI': {'name': '@m4jid_java'},
 'K_ZdbG': {'name': '@felicsjp'},
 'K_inrz': {'name': '@CDSMarine'},
 'KaGV6E': {'name': '@joemj'},
 'KaIIJS': {'name': '@1AmericanAirman'},
 'KaOown': {'name': '@aki0816'},
 'KggOTa': {'name': '@mariusbonde'},
 'KjPMCE': {'name': '@CST2dot0'},
 'Klw6TG': {'name': '@infosecexpert'},
 'KmlaTc': {'name': '@rubendvasquez1'},
 'KmqqmU': {'name': '@doometdotcom'},
 'Kmsemp': {'name': '@douglasmun'},
 'KqGstY': {'name': '@barryporridge'},
 'KrYAAN': {'name': '@Ankit_319'},
 'Ks31NA': {'name': 'ѠOOT'},
 'KshWbZ': {'name': '@neterix'},
 'Ksiu-p': {'name': '@Diwakar_Singh_'},
 'Kt8kLz': {'name': '@datativeUK'},
 'KvnIGb': {'name': '@GanetheGreat'},
 'KzgSs2': {'name': '@KKuehneman'},
 'L21S7Q': {'name': '@jburnsconsult'},
 'L5P3-6': {'name': '@TomL01011011'},
 'L7RFv2': {'name': '@vofnromania'},
 'L8q8_Q': {'name': '@banana_gunso'},
 'LBAcRH': {'name': '@IXCG_Limited'},
 'LB_hZd': {'name': '@SeKuRiGo'},
 'LBuFwT': {'name': '@SOWACOMPUTACION'},
 'LDAnth': {'name': '@rhpco'},
 'LDMcA4': {'name': '@makay_gras'},
 'LEhQMR': {'name': '@Ptr32Void'},
 'LF3cUu': {'name': '@kchr'},
 'LG8qIf': {'name': '@thebhavin_v'},
 'LIMAWR': {'name': '@PelsAndre'},
 'LIo3n-': {'name': '@cisco_dp'},
 'LJa85U': {'name': '@_MKSingh_'},
 'LLyN5u': {'name': '@AnytimeEmail'},
 'LOLu-E': {'name': '@Gate_15_Analyst'},
 'LQ78Ki': {'name': '@jmariocadavid'},
 'LUW9j-': {'name': '@buitreddesierto'},
 'LVIvl5': {'name': '@5y5tem5'},
 'LW0a_M': {'name': '@swsayman'},
 'LW_GpT': {'name': '@_plesna'},
 'Lb294Q': {'name': '@dmred1'},
 'LeIHwR': {'name': '@JobSecUK'},
 'LgvzsA': {'name': '@simon31216'},
 'LjWGcy': {'name': '@Allsecu'},
 'Lm5oQg': {'name': '@__Fermi__'},
 'Lp-vwS': {'name': '@electricgherkin'},
 'LsEum7': {'name': '@BodyGuardFrank'},
 'LuZY8H': {'name': '@sam_wmonks'},
 'LvI7X1': {'name': '@mr_jukai'},
 'Lz5WSD': {'name': '@CasualSec'},
 'MD6BOV': {'name': '@Bry_Campbell'},
 'MFbgwS': {'name': '@thlnk3r'},
 'MH2qDf': {'name': '@mangeshaaaa'},
 'MHzSW9': {'name': '@loucif_kharouni'},
 'MK1tiJ': {'name': '@gh0std4ncer'},
 'MM0Gsg': {'name': '@Data88Geek'},
 'MaevLP': {'name': '@Bulldog_Palm'},
 'MhDamN': {'name': '@SteveRJ_Sans'},
 'Mul-H9': {'name': '@MStingleyTX'},
 'MyuT9a': {'name': '@caboddington'},
 'MzJIVd': {'name': '@RobDoxPower'},
 'N1huQ1': {'name': '@tony_cleal'},
 'N4HI3c': {'name': '@YHVHvx'},
 'NAxzoA': {'name': '@SecurityToday'},
 'NDcy8w': {'name': '@ropchain'},
 'NDvnNb': {'name': '@tuxedo_ha'},
 'NInanW': {'name': '@sehque'},
 'NJHBpw': {'name': '@Franwhitehat'},
 'NNL5bV': {'name': '@JohnsonITUK'},
 'NQnPAY': {'name': 'Consultores y Sistemas 4S, C.A.'},
 'NQpf4G': {'name': '@CrackerHacker00'},
 'NSZ0lc': {'name': '@HburgHackers'},
 'NSyVEG': {'name': '@ErwinsAegis'},
 'NUuRTR': {'name': '@ITG_updates'},
 'NcdE1R': {'name': '@maldevel'},
 'NdHXOy': {'name': '@infosec_blogs'},
 'Ni1hrO': {'name': '@virus_tracker'},
 'Nsst4i': {'name': '@Enterprise_ITS'},
 'Nt0jwC': {'name': 'CeptBiro'},
 'NtYNYi': {'name': '@kazyu56kt'},
 'NwkLkX': {'name': '@CientiH'},
 'NzE0NC': {'name': '@cyb3rops'},
 'O35nCv': {'name': '@TerryBowdenNZ'},
 'O3_eQb': {'name': '@4n0n_HR'},
 'ODrruH': {'name': '@thisisdey'},
 'OGINER': {'name': '@ronin3510'},
 'OI-aoW': {'name': '裴诺'},
 'OLfGwI': {'name': '@Protecdor'},
 'OO2YX9': {'name': '@SFPwN'},
 'OPMOdy': {'name': '@yuunaka18'},
 'OPd3I8': {'name': '@FCSLinus'},
 'OPq1Ff': {'name': '@cyberscTom'},
 'OR1hMo': {'name': '@HoudiniOctopus'},
 'OTS729': {'name': '@68756e6168'},
 'OU3JNb': {'name': '@Socmedmanoxford'},
 'OVSG7Q': {'name': '@_Gen2_'},
 'OY0A6T': {'name': '@e92patrick'},
 'OYzYWV': {'name': 'aggelon'},
 'OZjM4B': {'name': '@PacketKangaroo'},
 'Oawmjf': {'name': '@sezgin063443'},
 'Oge6RM': {'name': '@NWFCyber'},
 'OhsLWg': {'name': '@Dr_v0ly'},
 'OhxST1': {'name': '@LohanOnSecurity'},
 'OiItGt': {'name': '@CowboyNewsBot'},
 'OlmPJF': {'name': '@SecureNewsru'}}

When we have a list of people who talks about an topic, we can iterate through the list of ID, find their sentences and count the noun in those. This way, we could see who are "really interested" in the topic.

A different way of looking at Usernames, especially Twitter username is to look through the relationship aspect. This means that take a twitter handle. We will also look at their following. We suspect that the twitter user would follows subject of importance to them. In this way, we will be able to see other account to put in our crawl list. Furthermore, we can look at a group of people and see if they all talk about the same topic.



In [ ]: