Clustering test data and evaluating clustering technique with it


In [31]:
from bs4 import BeautifulSoup

f = open('../test_data/1957284403.ofs.gold.xml', 'r')
article_text = f.read();
soup = BeautifulSoup(article_text, "lxml")

comment = {
    "bloggerId": "author",
    "sentences": [], # all sentences in a comment,
    "parents": [] # the order depends on how beautifulsoup gives me the parents
}
article = {
    "sentences": {}, # each sentence goes here, hashed by id
    "comments": {} # each comment goes here, hashed by id
}
commentsHTML = soup.findAll('comment')
print commentsHTML[0]
for c in commentsHTML:
    comment_sentences = []
    comment_parents = []


<comment bloggerid="epinoa" id="c0">
<s id="s57">[epinoa] So BT and Vodafone breached the Data Protection Act ? </s>
<comment bloggerid="enfrance " id="c1">
<s id="s58">[enfrance ] Is this Atlantic Bridge in operation ? </s>
</comment>
<comment bloggerid="alex442" id="c2">
<s id="s59">[alex442] Human Rights ? </s>
<s id="s60">Aww , how quaint ... </s>
</comment>
<comment bloggerid="nitpiqin " id="c3">
<s id="s61">[nitpiqin ] That 's what we need to judge them and prosecute them with their own laws . </s>
<s id="s62">They are breaking their own laws . </s>
</comment>
<comment bloggerid="MirandaKeen " id="c4">
<s id="s63">[MirandaKeen ] In the near future , I hope . </s>
</comment>
<comment bloggerid="MirandaKeen " id="c5">
<s id="s64">[MirandaKeen ] And the Universal Declaration of Human RightsArticle 12No one shall be subjected to arbitrary interference with their privacy , family , home or correspondence , nor to attacks upon their honour and reputation . </s>
<s id="s65">Everyone has the right to the protection of the law against such interference and attacks . </s>
</comment>
</comment>

Can I extract just the sentence that belongs to the replied comment?


In [7]:
html = commentsHTML[0]
comms = html.findAll('comment')
first_comm_s = html.findAll('s', recursive=False)
first_comm_s


Out[7]:
[<s id="s57">[epinoa] So BT and Vodafone breached the Data Protection Act ? </s>]

Can I extract all the comment tags, including the nested ones?

Turns out the findAll is recursive and gets me every comment. From there, getting the parents is easy.


In [8]:
for c in commentsHTML:
    if c['id'] == "c4":
        print c
        print [p['id'] for p in c.findParents("comment")]
        break


<comment bloggerid="MirandaKeen " id="c4">
<s id="s63">[MirandaKeen ] In the near future , I hope . </s>
</comment>
['c0']

Therefore, the function to extract the comments is:


In [9]:
def parse_comments(comments):
    '''
    comment = {
        "bloggerId": "author",
        "sentences": [], # all sentences in a comment,
        "parents": [] # the order depends on how beautifulsoup gives me the parents
    }
    '''
    
    parsed_comments = {}
    for c in comments:
        comment = {}
        comment['bloggerId'] = c['bloggerid']
        
        comment['sentences_ids'] = [s['id'] for s in c.findAll('s', recursive=False)]
        comment['parents'] = [p['id'] for p in c.findParents("comment")]
        parsed_comments[c['id']] = comment
    
    return parsed_comments

In [10]:
import json
import pprint
def parse_article(html):
    soup = BeautifulSoup(html, "lxml")
    
    sentences = soup.findAll('s')
    parsed_sentences = {}
    for s in sentences:
        parsed_sentences[s['id']] = s.get_text() 
    
    parsed_comments = parse_comments(soup.findAll('comment'))
    
    article = {
        'sentences': parsed_sentences,
        'comments': parsed_comments
    }

    return article

article = parse_article(article_text)
pprint.pprint(article)
json_article = json.dumps(article, indent=4)
print len(article['comments'].values()), " comments parsed."
print len(article['sentences'].values()), " sentences parsed."


{'comments': {'c0': {'bloggerId': 'epinoa',
                     'parents': [],
                     'sentences_ids': ['s57']},
              'c1': {'bloggerId': 'enfrance ',
                     'parents': ['c0'],
                     'sentences_ids': ['s58']},
              'c10': {'bloggerId': 'dennis79',
                      'parents': [],
                      'sentences_ids': ['s73', 's74', 's75', 's76', 's77']},
              'c11': {'bloggerId': 'SpecialRX',
                      'parents': ['c10'],
                      'sentences_ids': ['s78', 's79']},
              'c12': {'bloggerId': ' Malkatrinho',
                      'parents': ['c10'],
                      'sentences_ids': ['s80', 's81']},
              'c13': {'bloggerId': 'timetorememberagain ',
                      'parents': [],
                      'sentences_ids': ['s82',
                                        's83',
                                        's84',
                                        's85',
                                        's86',
                                        's87',
                                        's88']},
              'c14': {'bloggerId': 'AGrumpyGit ',
                      'parents': ['c13'],
                      'sentences_ids': ['s89']},
              'c15': {'bloggerId': 'timetorememberagain ',
                      'parents': ['c13'],
                      'sentences_ids': ['s90',
                                        's91',
                                        's92',
                                        's93',
                                        's94',
                                        's95']},
              'c16': {'bloggerId': 'SamSSSS',
                      'parents': ['c13'],
                      'sentences_ids': ['s96', 's97', 's98']},
              'c17': {'bloggerId': 'VaughnParadis ',
                      'parents': [],
                      'sentences_ids': ['s99', 's100', 's101', 's102']},
              'c18': {'bloggerId': 'Snapshackle ',
                      'parents': ['c17'],
                      'sentences_ids': ['s103', 's104']},
              'c19': {'bloggerId': ' OurPlanet ',
                      'parents': ['c17'],
                      'sentences_ids': ['s105', 's106', 's107']},
              'c2': {'bloggerId': 'alex442',
                     'parents': ['c0'],
                     'sentences_ids': ['s59', 's60']},
              'c20': {'bloggerId': ' Malkatrinho',
                      'parents': ['c17'],
                      'sentences_ids': ['s108']},
              'c21': {'bloggerId': 'jonbryce ',
                      'parents': ['c17'],
                      'sentences_ids': ['s109', 's110', 's111', 's112']},
              'c22': {'bloggerId': 'IronCurtain ',
                      'parents': [],
                      'sentences_ids': ['s113',
                                        's114',
                                        's115',
                                        's116',
                                        's117',
                                        's118',
                                        's119']},
              'c23': {'bloggerId': 'enfrance ',
                      'parents': ['c22'],
                      'sentences_ids': ['s120',
                                        's121',
                                        's122',
                                        's123',
                                        's124',
                                        's125',
                                        's126',
                                        's127',
                                        's128',
                                        's129',
                                        's130']},
              'c24': {'bloggerId': 'zacmcd ',
                      'parents': ['c22'],
                      'sentences_ids': ['s131', 's132', 's133']},
              'c25': {'bloggerId': ' Haigin88 ',
                      'parents': ['c22'],
                      'sentences_ids': ['s134']},
              'c26': {'bloggerId': 'kaneandabel ',
                      'parents': ['c22'],
                      'sentences_ids': ['s135',
                                        's136',
                                        's137',
                                        's138',
                                        's139']},
              'c27': {'bloggerId': 'kaneandabel ',
                      'parents': ['c22'],
                      'sentences_ids': ['s140']},
              'c28': {'bloggerId': 'ViktorBurakov',
                      'parents': [],
                      'sentences_ids': ['s141', 's142']},
              'c29': {'bloggerId': 'jonwilde ',
                      'parents': ['c28'],
                      'sentences_ids': ['s143', 's144', 's145']},
              'c3': {'bloggerId': 'nitpiqin ',
                     'parents': ['c0'],
                     'sentences_ids': ['s61', 's62']},
              'c30': {'bloggerId': ' Malkatrinho',
                      'parents': ['c28'],
                      'sentences_ids': ['s146', 's147']},
              'c31': {'bloggerId': ' Malkatrinho',
                      'parents': ['c28'],
                      'sentences_ids': ['s148', 's149', 's150']},
              'c32': {'bloggerId': 'ScepticOptimist ',
                      'parents': ['c28'],
                      'sentences_ids': ['s151', 's152', 's153', 's154']},
              'c33': {'bloggerId': 'gwelddycybydd ',
                      'parents': ['c28'],
                      'sentences_ids': ['s155', 's156']},
              'c34': {'bloggerId': 'RandomAccountant ',
                      'parents': [],
                      'sentences_ids': ['s157', 's158']},
              'c35': {'bloggerId': 'allislost ',
                      'parents': ['c34'],
                      'sentences_ids': ['s159']},
              'c36': {'bloggerId': 'GreenRevolution ',
                      'parents': [],
                      'sentences_ids': ['s160', 's161', 's162']},
              'c37': {'bloggerId': 'cockersf1 ',
                      'parents': [],
                      'sentences_ids': ['s163']},
              'c38': {'bloggerId': 'brendon1 ',
                      'parents': ['c37'],
                      'sentences_ids': ['s164', 's165']},
              'c39': {'bloggerId': 'zacmcd ',
                      'parents': ['c37'],
                      'sentences_ids': ['s166', 's167', 's168']},
              'c4': {'bloggerId': 'MirandaKeen ',
                     'parents': ['c0'],
                     'sentences_ids': ['s63']},
              'c40': {'bloggerId': ' TheSandbag',
                      'parents': ['c37'],
                      'sentences_ids': ['s169', 's170']},
              'c41': {'bloggerId': 'nowwhataretheyupto',
                      'parents': [],
                      'sentences_ids': ['s171', 's172', 's173', 's174']},
              'c42': {'bloggerId': 'Miss_Direction ',
                      'parents': [],
                      'sentences_ids': ['s175']},
              'c43': {'bloggerId': 'elevengoalposts ',
                      'parents': ['c42'],
                      'sentences_ids': ['s176', 's177']},
              'c44': {'bloggerId': 'Exodus20',
                      'parents': ['c42'],
                      'sentences_ids': ['s178', 's179']},
              'c45': {'bloggerId': 'Exodus20',
                      'parents': ['c42'],
                      'sentences_ids': ['s180', 's181', 's182']},
              'c46': {'bloggerId': 'Miss_Direction ',
                      'parents': ['c42'],
                      'sentences_ids': ['s183', 's184', 's185', 's186']},
              'c47': {'bloggerId': 'amonduul ',
                      'parents': [],
                      'sentences_ids': ['s187']},
              'c48': {'bloggerId': ' Derek Seymour ',
                      'parents': ['c47'],
                      'sentences_ids': ['s188']},
              'c49': {'bloggerId': 'BStroszek ',
                      'parents': ['c47'],
                      'sentences_ids': ['s189']},
              'c5': {'bloggerId': 'MirandaKeen ',
                     'parents': ['c0'],
                     'sentences_ids': ['s64', 's65']},
              'c6': {'bloggerId': '00jebus ',
                     'parents': [],
                     'sentences_ids': ['s66']},
              'c7': {'bloggerId': 'Bluestone ',
                     'parents': ['c6'],
                     'sentences_ids': ['s67', 's68']},
              'c8': {'bloggerId': 'jonbryce ',
                     'parents': ['c6'],
                     'sentences_ids': ['s69', 's70']},
              'c9': {'bloggerId': 'DavidMillipede ',
                     'parents': ['c6'],
                     'sentences_ids': ['s71', 's72']}},
 'sentences': {'s0': u'BT and Vodafone among telecoms companies passing details to GCHQ',
               's1': u"Some of the world 's leading telecoms firms , including BT and Vodafone , are secretly collaborating with Britain 's spy agency GCHQ , and are passing on details of their customers ' phone calls , email messages and Facebook entries , documents leaked by the whistleblower Edward Snowden show . ",
               's10': u'It gives top secret codenames for each firm , with BT ( " Remedy " ) , Verizon Business ( " Dacron " ) , and Vodafone Cable ( " Gerontic" ) . ',
               's100': u'Best way to keep your information secure ? ',
               's101': u'Use Vodafone . ',
               's102': u"You wo n't be in contact with anybody . ",
               's103': u'[Snapshackle ] Agreed , 4G? ? ',
               's104': u'3G would be nice , or even a fucking signal . ',
               's105': u'[ OurPlanet ] That really made me laugh in a serious situation . ',
               's106': u'Thanks VaughnParadisThe best antidote to this crap is to make them irrelevant and stupid looking . ',
               's107': u'Fear only feeds these imbeciles egos . ',
               's108': u"[ Malkatrinho] that 's a lovely shade of green . ",
               's109': u'[jonbryce ] This is the former Cable & Wireless division of Vodafone . ',
               's11': u'The other firms include Global Crossing ( " Pinnage " ) , Level 3 ( " Little " ) , Viatel ( "Vitreous " ) and Interoute ( " Streetcar " ) . ',
               's110': u'Lots of ISPs use them for their transatlantic traffic . ',
               's111': u"Those that do n't use one of the other ones listed . ",
               's112': u'Level 3 is probably the biggest . ',
               's113': u'[IronCurtain ] The Governments and the Corporations banding together . ',
               's114': u'cui bonno ? ',
               's115': u'you ? ',
               's116': u'me ? ',
               's117': u'Freedom ? ',
               's118': u'Liberty ? ',
               's119': u'not likely . ',
               's12': u'The companies refused to comment on any specifics relating to Tempora , but several noted they were obliged to comply with UK and EU law . ',
               's120': u'[enfrance ] If apathy is the order of the day here , perhaps pionting out that Vodaphone have cost each invidual in the UK so much as a result of its tax deal would that make people wake up ? ',
               's121': u'Nah , I despair . ',
               's122': u'And the likelyhood of anyone boycotting either BT or Vodaphone is nill except perhaps a few who will make no difference at all . ',
               's123': u'How about a petition of some sort ? ',
               's124': u"Its late and I 'm too tired and hot to think of one that could do the subject justice . ",
               's125': u'Anyone ? ',
               's126': u'As for people thinking this does not affect them . ',
               's127': u"Even if you think you are doing nothing wrong what 's to stop you being framed for something by this information ? ",
               's128': u'or mistaken for someone else or regarded as doing something which may be the first step to terrorism , like searching for pressure cookers on the net , buying a rucksack on the net and having a son with a lively interest in online news ? ',
               's129': u'Just ask the New York family elsewhere in the Guardian . ',
               's13': u'The revelations are likely to dismay GCHQ and Downing Street , who are fearful that BT and the other firms will suffer a backlash from customers furious that their private data and intimate emails have been secretly passed to a government spy agency . ',
               's130': u'Perhaps we should all stop reading the news online just in case ! ',
               's131': u'[zacmcd ] There is a petition to Stop Tempora , it takes a minute to sign ( but a confirmation email needs to be clicked ) . ',
               's132': u'Also , minimise the use of these companies ; give them less money to undermine us with . ',
               's133': u'You can make free 2048 bit encrypted VoIP calls using Yuilop save money and lower their turnover ... ',
               's134': u'[ Haigin88 ] &quot ;Fascism should rightly be called Corporatism , as it is the merger of corporate and government power .&quot ; . Benito Mussolini ',
               's135': u'[kaneandabel ] TThe question is when will the poeple fill the street like it happened before Iraq war , a million march in London would be a good idea now . ',
               's136': u'Of course then we may have a false flag operation and then clamp on the screws ... with the orange alert worldwide , made just a day back by Gen Alexanders team . ',
               's137': u'But otherwise , I see no chance of change . ',
               's138': u'In Germany the protests are happening in ernst . ',
               's139': u'Thousands fill German streets to protest Berlin\u2019s NSA spying involvement ',
               's14': u'In June a source with knowledge of intelligence said the companies had no choice but to co-operate in this operation . ',
               's140': u'[kaneandabel ] ; ) I deserved that one ',
               's141': u'[ViktorBurakov] To begin with I thought it all meant that they were paranoid , perpetually terrified of some bogeyman , imaginary or real . ',
               's142': u"Now though , I 'm convinced it 's something a lot more sinister ",
               's143': u'[jonwilde ] Which , I guess , means that the terrorists have won . ',
               's144': u"I was n't nervous at all before I learnt how closely we were being monitored . ",
               's145': u'What are those odds again against being a victim of a terrorist act ? ',
               's146': u"[ Malkatrinho] I think it 's more to do with incompetence and leakiness than it is part of some grand dastardly plan to provoke people . ",
               's147': u"They 'd much rather we stayed ignorant , compliant and biddable . ",
               's148': u'[ Malkatrinho] They are terrified . ',
               's149': u'Of what happens when enough people begin to protest about corruption , inequality , collapsing civil society , etc . ',
               's15': u'They are forbidden from revealing the existence of warrants compelling them to allow GCHQ access to the cables . ',
               's150': u"They 're terrified that the same kind of mass protests that swept Brazil , Turkey , Bulgaria or the Middle East will eventually be provoked in the West . ",
               's151': u'[ScepticOptimist ] And it is exactly the only way we can regain our freedom - a British spring ; mass protest & civil disobedience . ',
               's152': u'Then we will see these fuckers true colours when they send the troops in an people are dragged bleeding off the streets . ',
               's153': u'But they can not win if we all unite against them . ',
               's154': u'History has shown time and time again that true power lies in the hands of the people . ',
               's155': u'[gwelddycybydd ] I do believe that the government is trying to provoke confrontation.Their security forces ; state and private , are well trained and very well equipped.The ',
               's156': u"drip , drip , drip of corruption , cronyism , incompetence and naked self interest and frequently exposed by their 'friends ' in the mostly right leaning media seem to be designed to build pressure in the populace .Maybe , they want to put us in our place , once and for all . ",
               's157': u'[RandomAccountant ] Welcome to 21st century Britain.Big ',
               's158': u'Brother has arrived .29 years late . ',
               's159': u'[allislost ] Been like this for an age , only now many more understand the plight of the many who have been hounded into submission and/or mental institutions . ',
               's16': u"Together , these seven companies operate a huge share of the high-capacity undersea fibre-optic cables that make up the backbone of the internet 's architecture . ",
               's160': u'[GreenRevolution ] I am going to boycott both ! ',
               's161': u'This disgraceful spying business has no end ! ',
               's162': u'Shame on those who participate in this abhorent snooping . ',
               's163': u'[cockersf1 ] Do you think this qould be a way of getting out of contracts with these companies ? ',
               's164': u"[brendon1 ] The problem is that most of the world 's Internet traffic goes via Level3 . ",
               's165': u"There 's no easy way of getting around that . ",
               's166': u'[zacmcd ] Hard for the layman to know . ',
               's167': u'SSL being secure would explain why they need backdoor access via Prism . ',
               's168': u'There is also mention that encrypted mobile messaging apps pose a threat to their capabilities ... ',
               's169': u"[ TheSandbag] From the descriptions of tempora I 'm fairly sure they have broken SSL otherwise its pointless . ",
               's17': u"GCHQ 's mass tapping operation has been built up over the past five years by attaching intercept probes to the transatlantic cables where they land on British shores . ",
               's170': u"Large numbers of web apps and traffic is ssl encrypted now so why bother creating a massive fiber intercept program if you could n't see 60 % + of all the traffic and almost all of the communications data ? ",
               's171': u'[nowwhataretheyupto] so this is why the government go easy on these toss pots not paying corporation tax . ',
               's172': u'And they dare to call bank robbers criminals . ',
               's173': u"Jesus , there 's no f*cking morals with any of them . ",
               's174': u"What 's happened to these people , or are they so smacked off their tits to care ? . ",
               's175': u'[Miss_Direction ] There are four points i would like to raise : 1 ) Even if the security services can be trusted with all the vast data they gather what happens if it falls into the wrong hands 2 ) GCHQ have been known in the past to target animals rights protestors , environmental organisations and Union groups therefore it is clear that they are already used as a political weapon against dissent 3 ) Democracy depends on transparency and accountability and this important element is being undermined at the highest levels for the benefit of corporate and political interests 4 ) Lastly i would like to know if the NSA and GCHQ actively targeted the Occupy movement and who authorised this action and what was their remit ',
               's176': u"[elevengoalposts ] ''GCHQ have been known in the past to target animals rights protestors , environmental organisations and Union groups .. . ",
               's177': u"'' You have first-hand , written evidence about that , or just assertions ? ",
               's178': u'[Exodus20] Rights and laws are at best ideals , no more , no less . ',
               's179': u'To those corrupted by greed and power , these are nothing more than sweets to quieten gullible and innocent children . ',
               's18': u"GCHQ 's station in Bude , north Cornwall , plays a role . ",
               's180': u'[Exodus20] Even if the security services can be trusted with all the vast data they gather what happens if it falls into the wrong hands I share that sentiment and concern . ',
               's181': u'Security services are necessary but i am afraid most , even all of them , sooner rather than later will become rogue , cancerous , arrogant corrupt and become the tools of suppression for politicians and big money . ',
               's182': u'Most people do not appreciate the preciousness and precariousness of accountable democracy and responsibility freedom . ',
               's183': u'[Miss_Direction ] The right to freedom of expression is recognized as a human right under Article 19 of the Universal Declaration of Human Rights and recognized in international human rights law in the International Covenant on Civil and Political Rights ( ICCPR ) . ',
               's184': u'Article 19 of the ICCPR states that &quot ;everyone shall have the right to hold opinions without interference&quot ; and &quot ;everyone shall have the right to freedom of expression ; this right shall include freedom to seek , receive and impart information and ideas of all kinds , regardless of frontiers , either orally , in writing or in print , in the form of art , or through any other media of his choice&quot ; http ://en.wikipedia ',
               's185': u'.org/wiki/ ',
               's186': u'Freedom_of_speech Our basic right to freedom of speech should be upheld through international law as long as it does not harm or interfere with the liberty of others ... it therefore appears that we have lost sight of what is an important and we have a problem that requires fixing . ',
               's187': u'[amonduul ] Ignore shockman - rightwing troll . ',
               's188': u"[ Derek Seymour ] This is not a right or left issue , so I 'm calling shockman a fascist troll ",
               's189': u"[BStroszek ] If you 're tired , why do n't you fuck off to bed , and spare us your drivel . ",
               's19': u'The cables carry data to western Europe from telephone exchanges and internet servers in north America . ',
               's2': u'BT , Vodafone Cable , and the American firm Verizon Business \u2013 together with four other smaller providers \u2013 have given GCHQ secret unlimited access to their network of undersea cables . ',
               's20': u'This allows GCHQ and NSA analysts to search vast amounts of data on the activity of millions of internet users . ',
               's21': u'Metadata \u2013 the sites users visit , whom they email , and similar information \u2013 is stored for up to 30 days , while the content of communications is typically stored for three days . ',
               's22': u'GCHQ has the ability to tap cables carrying both internet data and phone calls . ',
               's23': u'By last year GCHQ was handling 600m "telephone events " each day , had tapped more than 200 fibre-optic cables and was able to process data from at least 46 of them at a time . ',
               's24': u'Each of the cables carries data at a rate of 10 gigabits per second , so the tapped cables had the capacity , in theory , to deliver more than 21 petabytes a day \u2013 equivalent to sending all the information in all the books in the British Library 192 times every 24 hours . ',
               's25': u'This operation is carried out under clandestine agreements with the seven companies , described in one document as "intercept partners " . ',
               's26': u'The companies are paid for logistical and technical assistance . ',
               's27': u'The identity of the companies allowing GCHQ to tap their cables was regarded as extremely sensitive within the agency . ',
               's28': u'Though the Tempora programme itself was classified as top secret , the identities of the cable companies was even more secret , referred to as "exceptionally controlled information " , with the company names replaced with the codewords , such as " GERONTIC" , " REMEDY" and " PINNAGE" . ',
               's29': u'However , some documents made it clear which codenames referred to which companies . ',
               's3': u"The cables carry much of the world 's phone calls and internet traffic . ",
               's30': u'GCHQ also assigned the firms " sensitive relationship teams " . ',
               's31': u'One document warns that if the names emerged it could cause "high-level political fallout " . ',
               's32': u'Germans have been enraged by the revelations of spying by the National Security Agency and GCHQ after it emerged that both agencies were hoovering up German data as well . ',
               's33': u'On Friday the S\xfcddeutsche said it was now clear that private telecoms firms were far more deeply complicit in US-UK spying activities than had been previously thought . ',
               's34': u'The source familiar with intelligence maintained in June that GCHQ was " not looking at every piece of straw " but was sifting a "vast haystack of data " for what he called "needles " . ',
               's35': u'He added : " If you had the impression we are reading millions of emails , we are not . ',
               's36': u'There is no intention in this whole programme to use it for looking at UK domestic traffic \u2013 British people talking to each other . ',
               's37': u'" The source said analysts used four criteria for determining what was examined : security , terror , organised crime and Britain \'s economic wellbeing ." ',
               's38': u"The vast majority of the data is discarded without being looked at \u2026 we simply do n't have the resources . ",
               's39': u'" Nonetheless , the agency repeatedly referred to plans to expand this collection ability still further in the future . ',
               's4': u"In June the Guardian revealed details of GCHQ 's ambitious data-hoovering programmes , Mastering the Internet and Global Telecoms Exploitation , aimed at scooping up as much online and telephone traffic as possible . ",
               's40': u'Once it is collected , analysts are able to search the information for emails , online chats and browsing histories using an interface called XKeyscore , uncovered in the Guardian on Wednesday . ',
               's41': u'By May 2012 , 300 analysts from GCHQ and 250 NSA analysts had direct access to search and sift through the data collected under the Tempora program . ',
               's42': u'Documents seen by the Guardian suggest some telecoms companies allowed GCHQ to access cables which they did not themselves own or operate , but only operated a landing station for . ',
               's43': u'Such practices could raise alarm among other cable providers who do not co-operate with GCHQ programmes that their facilities are being used by the intelligence agency . ',
               's44': u'Telecoms providers can be compelled to co-operate with requests from the government , relayed through ministers , under the 1984 Telecommunications Act , but privacy advocates have raised concerns that the firms are not doing enough to challenge orders enabling large-scale surveillance , or are co-operating to a degree beyond that required by law . ',
               's45': u'" We urgently need clarity on how close the relationship is between companies assisting with intelligence gathering and government , " said Eric King , head of research for Privacy International . ',
               's46': u'" Were the companies strong-armed , or are they voluntary intercept partners ? ',
               's47': u'" Vodafone said it complied with the laws of all the countries in which its cables operate . ',
               's48': u'" Media reports on these matters have demonstrated a misunderstanding of the basic facts of European , German and UK legislation and of the legal obligations set out within every telecommunications operator \'s licence \u2026 Vodafone complies with the law in all of our countries of operation , " said a spokesman . ',
               's49': u'" Vodafone does not disclose any customer data in any jurisdiction unless legally required to do so . ',
               's5': u'It emerged GCHQ was able to tap into fibre-optic cables and store huge volumes of data for up to 30 days . ',
               's50': u'Questions related to national security are a matter for governments not telecommunications operators . ',
               's51': u'" A spokeswoman for Interoute said : " As with all communication providers in Europe we are required to comply with European and local laws including those on data protection and retention . ',
               's52': u'From time to time we are presented with requests from authorities . ',
               's53': u'When we receive such requests , they are processed by our legal and security teams and if valid , acted upon . ',
               's54': u'" A spokeswoman for Verizon said : " Verizon continually takes steps to safeguard our customers \' privacy . ',
               's55': u'Verizon also complies with the law in every country in which we operate . ',
               's56': u'" BT declined to comment . ',
               's57': u'[epinoa] So BT and Vodafone breached the Data Protection Act ? ',
               's58': u'[enfrance ] Is this Atlantic Bridge in operation ? ',
               's59': u'[alex442] Human Rights ? ',
               's6': u'That operation , codenamed Tempora , has been running for 20 months . ',
               's60': u'Aww , how quaint ... ',
               's61': u"[nitpiqin ] That 's what we need to judge them and prosecute them with their own laws . ",
               's62': u'They are breaking their own laws . ',
               's63': u'[MirandaKeen ] In the near future , I hope . ',
               's64': u'[MirandaKeen ] And the Universal Declaration of Human RightsArticle 12No one shall be subjected to arbitrary interference with their privacy , family , home or correspondence , nor to attacks upon their honour and reputation . ',
               's65': u'Everyone has the right to the protection of the law against such interference and attacks . ',
               's66': u"[00jebus ] So THATs how Vodafone get away with not paying any tax ( allegedly ) hmmm .... is there a telecom company that does n't pass on infomation ? ",
               's67': u"[Bluestone ] You scratch our backs , we 'll scratch yours - and bugger the proles ! ",
               's68': u'Haw haw ! ',
               's69': u'[jonbryce ] Vodafone Cable was Cable & Wireless until they took it over on 27 July 2012 . ',
               's7': u"On Friday Germany 's S\xfcddeutsche newspaper published the most highly sensitive aspect of this operation \u2013 the names of the commercial companies working secretly with GCHQ , and giving the agency access to their customers ' private communications . ",
               's70': u'The tax settlement was in 2010 , so there is no link . ',
               's71': u"[DavidMillipede ] Probably not given that they have to comp'ly with secret government demands . ",
               's72': u"We need a revolution , and we need it now , to restore democracy to the people and not elitist wankers like this shabby government who could n't even win an election , yet take all the liberties they have been with public services . ",
               's73': u'[dennis79] All of this is certainly not done to combat terrorism . ',
               's74': u'Spying upon politics , economics , NGOs etc . ',
               's75': u"I 'm also pretty confident that a number of individuals are subjugated to more thorough investigation purely for state needs . ",
               's76': u'States have always set up people , used people as fall guys , coerced and manipulated their subjects . ',
               's77': u"All this is far easier when in the possession of a very complete knowledge of someone 's life . ",
               's78': u"[SpecialRX] I suspect you 're right . ",
               's79': u'Have a recommend on me . ',
               's8': u'The paper said it had seen a copy of an internal GCHQ powerpoint presentation from 2009 discussing Tempora . ',
               's80': u"[ Malkatrinho] It has nothing to do with &quot;Terrorists&quot ; , you 're bang on . ",
               's81': u"It 's about keeping an eye on the general populace , which might start getting a little &quot ;uppity&quot ; when the consequences of climate change and economic collapse really start to kick in in around 10-15 years time ( if we 're being optimistic ) . ",
               's82': u"[timetorememberagain ] A spokeswoman for Verizon said : &quot ;Verizon continually takes steps to safeguard our customers ' privacy . ",
               's83': u'Verizon also complies with the law in every country in which we operate .&quot ; This is a contradiction . ',
               's84': u"It sounds like they 're doing everything just right but in fact the two statements are mutually exclusive because They are forbidden from revealing the existence of warrants compelling them to allow GCHQ access to the cables . ",
               's85': u"So the government demands access to my ( and everyone else 's ) phone calls and the provider obeys without telling me but instead assures me they 're continually taking steps to safeguard my privacy . ",
               's86': u'Sheer doublespeak and deception of the highest order . ',
               's87': u'Recall Parliament now ! ',
               's88': u'Demonstrate now ! ',
               's89': u"[AGrumpyGit ] Democracy does indeed work that way , but we 've living in a quasi-democracy for som time now , in which we all go through a meaningless but apparently necessary election system but we always end up with &quot ;leaders&quot ; that perpuate their hidden agenda . ",
               's9': u'The document identified for the first time which telecoms companies are working with GCHQ \'s " special source " team . ',
               's90': u'[timetorememberagain ] It should have been debated in parliament.If the majority were in favour , the access would be granted.That is how democracy works . ',
               's91': u'Quite so . ',
               's92': u'Those among us who might previously have argued that at least we still live in a democracy , might now begin to question that assumption . ',
               's93': u'With luck we might even stage mass demonstrations demanding our rights . ',
               's94': u'Organise ! ',
               's95': u'Resist ! ',
               's96': u"[SamSSSS] I do n't see why the existence of warrants for access had to be hidden from the public. ",
               's97': u'It should have been debated in parliament . ',
               's98': u'If the majority were in favour , the access would be granted.That is how democracy works . ',
               's99': u"[VaughnParadis ] I 'd be concerned about Vodafone passing on details of my communications , if Vodafone could provide me with a signal in the first place . "}}
50  comments parsed.
190  sentences parsed.

Clustering just the sentences

Vectorizing the sentences (TFIDF)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer=super(StemmedTfidfVectorizer,self).build_analyzer()
        return lambda doc:(english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english',
                                    )

sentences_vectors = vectorizer.fit_transform(article['sentences'].values())
sorted_feature_indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n_features = 20
top_features = [features[i] for i in sorted_feature_indices[:top_n_features]]

print "%d features found" % (len(features))
print "Top %d features:" % (top_n_features)
print top_features


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-2b18684ea7fc> in <module>()
     13 
     14 sentences_vectors = vectorizer.fit_transform(article['sentences'].values())
---> 15 sorted_feature_indices = np.argsort(vectorizer.idf_)[::-1]
     16 features = vectorizer.get_feature_names()
     17 top_n_features = 20

NameError: name 'np' is not defined

Dimensionality reduction and Normalization


In [ ]:
import gensim
#Dimensionality reduction using LSI. Go from 6D to 2D.

X = sentences_vectors.todense()
dct = gensim.corpora.Dictionary(X)
lsi_docs = {}
num_topics = 500
lsi_model = gensim.models.LsiModel(dct, num_topics=500)
print lsi_model.shape
print lsi_model[:50]

Clustering with MeanShift

WHY ARE ALL VECTORS VALUED AT 0!???


In [12]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

bandwidth = estimate_bandwidth(X, quantile=0.3)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print "Number of estimated clusters : %d" % n_clusters_

# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k',
            markersize=14)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-6fe3eeed26f8> in <module>()
      2 from sklearn.cluster import MeanShift, estimate_bandwidth
      3 
----> 4 bandwidth = estimate_bandwidth(X, quantile=0.3)
      5 
      6 ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)

NameError: name 'X' is not defined

Using the same approach as a movie clusterer

http://brandonrose.org/clustering

Imports


In [13]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

Stopwords, stemming, and tokenizing


In [14]:
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
print 'Done'


Done

In [15]:
def tokenize_and_stem(sentences):
    tokens = [word for sent in sentences 
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(sentences):
    tokens = [word.lower() for sent in sentences
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

Make vocabulary

stemmmed and not-stemmed


In [16]:
totalvocab_stemmed = []
totalvocab_tokenized = []
allwords_stemmed = tokenize_and_stem(article['sentences'].values())
totalvocab_stemmed.extend(allwords_stemmed)

allwords_tokenized = tokenize_only(article['sentences'].values())
totalvocab_tokenized.extend(allwords_tokenized)

Pandas data frame to visualize the vocabulary


In [17]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized},
                          index = totalvocab_stemmed)
print 'there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame'
print 'here are the first words in the vocabulary'
vocab_frame.head()


there are 3352 items in vocab_frame
here are the first words in the vocabulary
Out[17]:
words
to to
those those
corrupt corrupted
by by
greed greed

TF-IDF and document similarity


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=20000,
                                  min_df=0.2, stop_words='english',
                                  use_idf=True, tokenizer=tokenize_and_stem,
                                  ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(article['sentences'].values())

print tfidf_matrix.shape


CPU times: user 2.11 s, sys: 17.7 ms, total: 2.12 s
Wall time: 2.13 s
(190, 120)

In [19]:
terms = tfidf_vectorizer.get_feature_names()

Cosine Similarity


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
dist_frame = pd.DataFrame(dist)
print dist


[[  0.00000000e+00   6.32368721e-01   4.41392655e-01 ...,   5.46565074e-01
    4.36723114e-01   5.89894745e-01]
 [  6.32368721e-01   0.00000000e+00   6.96169781e-01 ...,   6.60962208e-01
    6.80822683e-01   7.49110606e-01]
 [  4.41392655e-01   6.96169781e-01  -2.22044605e-16 ...,   5.77924745e-01
    4.60548146e-01   6.25575149e-01]
 ..., 
 [  5.46565074e-01   6.60962208e-01   5.77924745e-01 ...,   3.33066907e-16
    3.86541760e-01   4.06016946e-01]
 [  4.36723114e-01   6.80822683e-01   4.60548146e-01 ...,   3.86541760e-01
   -2.22044605e-16   4.31044747e-01]
 [  5.89894745e-01   7.49110606e-01   6.25575149e-01 ...,   4.06016946e-01
    4.31044747e-01   0.00000000e+00]]

K-means clustering


In [21]:
from sklearn.cluster import KMeans

num_clusters = 5
km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()


CPU times: user 112 ms, sys: 2.98 ms, total: 115 ms
Wall time: 119 ms

In [22]:
clusters


Out[22]:
[3,
 0,
 1,
 4,
 1,
 1,
 1,
 1,
 3,
 3,
 2,
 4,
 4,
 4,
 4,
 3,
 2,
 4,
 3,
 2,
 4,
 0,
 1,
 3,
 4,
 4,
 0,
 3,
 1,
 4,
 0,
 0,
 0,
 0,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 2,
 4,
 4,
 2,
 2,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 4,
 2,
 3,
 4,
 1,
 4,
 4,
 3,
 2,
 4,
 2,
 4,
 4,
 2,
 3,
 4,
 3,
 4,
 1,
 1,
 4,
 2,
 3,
 3,
 0,
 3,
 0,
 4,
 0,
 0,
 4,
 3,
 4,
 1,
 0,
 1,
 2,
 2,
 0,
 1,
 3,
 3,
 0,
 0,
 4,
 4,
 4,
 2,
 1,
 1,
 2,
 0,
 1,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 1,
 2,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 3,
 4,
 4,
 1,
 3,
 4,
 4,
 3,
 0,
 3,
 3,
 3,
 3,
 3,
 2,
 4,
 1,
 2,
 4,
 3,
 3,
 1,
 2,
 1,
 4,
 2,
 4,
 1,
 3,
 1,
 4,
 2,
 4,
 4,
 4,
 4,
 4,
 2,
 4,
 4,
 1,
 1,
 4,
 0,
 1,
 3,
 2,
 4,
 3,
 0,
 4,
 4,
 4,
 4,
 4,
 4,
 1,
 2,
 4,
 4]

Multidimensional scaling to plot?


In [23]:
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS

MDS()
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)

xs, ys = pos[:,0], pos[:, 1]

Plot


In [24]:
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}
cluster_names = {0: 'C0',
                 1: 'C1',
                 2: 'C2',
                 3: 'C3',
                 4: 'C4'}

# iPython now will show matplotlib plots inline
%matplotlib inline

df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=["s{0}".format(x) for x in range(190)]))

groups = df.groupby('label')

### set up the plot
fig, ax = plt.subplots(figsize=(17,9))
ax.margins(0.05)
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='',
           ms=12, label=cluster_names[name], color=cluster_colors[name],
           mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis='x',
        which='both',
        bottom='off',
        top='off',
        labelbottom='off')
    ax.tick_params(\
        axis='y',
        which='both',
        left='off',
        top='off',
        labelleft='off')
ax.legend(numpoints=1)

for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'],
           size=8)
    
plt.show()



In [25]:
print article['sentences']['s151']
print article['sentences']['s170']
print article['sentences']['s171']
print article['sentences']['s108']


[ScepticOptimist ] And it is exactly the only way we can regain our freedom - a British spring ; mass protest & civil disobedience . 
Large numbers of web apps and traffic is ssl encrypted now so why bother creating a massive fiber intercept program if you could n't see 60 % + of all the traffic and almost all of the communications data ? 
[nowwhataretheyupto] so this is why the government go easy on these toss pots not paying corporation tax . 
[ Malkatrinho] that 's a lovely shade of green . 

In [26]:
print article['sentences']['s93']
print article['sentences']['s150']
print article['sentences']['s114']
print article['sentences']['s110']


With luck we might even stage mass demonstrations demanding our rights . 
They 're terrified that the same kind of mass protests that swept Brazil , Turkey , Bulgaria or the Middle East will eventually be provoked in the West . 
cui bonno ? 
Lots of ISPs use them for their transatlantic traffic . 

Hierarchical document clustering

The Ward clustering algorithm !!!!


In [27]:
from scipy.cluster.hierarchy import ward, dendrogram
linkage_matrix = ward(dist) #define the linkage_matrix
# using ward clustering pre-computed distances

fig, ax = plt.subplots(figsize=(15,20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=["s{0}".format(x) for x in range(190)])

plt.tick_params(\
               axis = 'x',
               which ='both',
               bottom ='off',
               top = 'off',
               labelbottom = 'off')

plt.tight_layout()

plt.savefig('ward_clusters.png', dpi=200)



In [28]:
frame = pd.DataFrame(linkage_matrix)
frame.sort_values(2,axis=0, ascending=False)


Out[28]:
0 1 2 3
188 376 377 21.858239 190
187 369 375 13.101199 87
186 356 373 7.130920 103
185 372 374 5.770700 70
184 358 371 4.347340 46
183 335 370 3.707320 67
182 308 368 3.510684 24
181 359 362 3.493877 29
180 361 366 3.393611 52
179 222 367 3.253098 17
178 348 364 3.140669 21
177 357 365 3.131984 15
176 345 363 2.976825 38
175 350 352 2.945812 10
174 354 360 2.880574 16
173 347 349 2.870923 23
172 333 341 2.816104 14
171 336 343 2.766368 14
170 330 344 2.706653 8
169 324 353 2.691439 15
168 340 355 2.666661 17
167 276 342 2.651612 5
166 322 328 2.631879 36
165 285 351 2.508269 13
164 316 337 2.437192 8
163 332 334 2.337388 11
162 331 338 2.311653 5
161 318 329 2.281644 10
160 311 346 2.249182 5
159 280 303 2.199540 10
... ... ... ... ...
29 170 171 1.132782 2
28 6 201 1.127395 3
27 14 173 1.126105 2
26 92 133 1.108976 2
25 129 153 1.108879 2
24 17 42 1.105053 2
23 97 131 1.101264 2
22 38 39 1.073012 2
21 77 90 1.051660 2
20 198 202 1.043855 5
19 107 155 1.040418 2
18 80 185 1.029425 2
17 136 152 1.028330 2
16 35 164 1.018026 2
15 16 75 1.015498 2
14 169 181 1.007137 2
13 67 158 1.002289 2
12 43 192 0.969303 3
11 74 118 0.961551 2
10 3 148 0.959468 2
9 126 127 0.955704 2
8 34 40 0.926995 2
7 66 156 0.924832 2
6 79 186 0.902000 2
5 85 91 0.881387 2
4 41 115 0.875310 2
3 49 113 0.842472 2
2 166 188 0.820532 2
1 12 117 0.716755 2
0 13 116 0.688414 2

189 rows × 4 columns

Extracting the links


In [65]:
soup = BeautifulSoup(article_text, "lxml")
def is_valid_link(tag):
    if tag.name != 'link':
        return False
    link = tag
    l_conf = link['link_confidence']
    l_val = link['validation']
    arg = link.find_next_sibling('argument')
    sent = link.find_next_sibling('sentiment')
    a_val = arg['validation']
    s_val = sent['validation']
    a_conf = arg['val_confidence']
    s_conf = sent['val_confidence']
    args = [l_val, a_val, s_val, l_conf, a_conf, s_conf]
    return all(el == '1' or el == 'yes' for el in args)

linksHTML = soup.findAll(lambda tag:is_valid_link(tag))

print len(linksHTML), "valid links found!"

parsed_links = []
for link_html in linksHTML:
    arg_html = link_html.find_next_sibling('argument')
    sent_html = link_html.find_next_sibling('sentiment')
    link = {}
    link['id'] = link_html['id']
    link['art_sentence'] = link_html['art_sentence']
    link['com_sentence'] = link_html['com_sentence']
    link['confidence'] = link_html['link_confidence']
    link['validation'] = link_html['validation']
    
    arg = {}
    arg['label'] = arg_html['label']
    arg['confidence'] = arg_html['val_confidence']
    arg['validation'] = arg_html['validation']
    
    sent = {}
    sent['label'] = sent_html['label']
    sent['confidence'] = sent_html['val_confidence']
    sent['validation'] = sent_html['validation']
    
    link['argument'] = arg
    link['sentiment'] = sent
    parsed_links.append(link)

# pprint.pprint(parsed_links, indent=4)
print len(parsed_links),"links parsed!"


140 valid links found!
140 links parsed!

In [ ]: