Task 2: Identifying words by frequency A bigram is sequence of two consecutive tokens. The previous sentence, for example, contains the bigrams: (a bigram), (bigram is), (is sequence), (sequence of), (of two), (two consecutive), and (consecutive tokens). Across the entire corpus find (1) the top 50 most frequent unigrams (single tokens), and (2) the top 50 most frequent bigrams. The output should be a list of 100 lines, where the first 50 lines contain a single term each, in order of frequency, followed by 50 lines containing two terms each, in order of the bigram frequency.


In [49]:
import collections
import re
import sys
import glob
import os

In [89]:
def tokenize(string):
    
    return re.findall(r'\w+',string.lower())

def count_ngrams(lines,min_length=1,max_length=2):
    
    
    # list of consecutive elements from min_length to max_length
    lengths=range(min_length,max_length+1)
    
    # dictionary of length as key and Counter as value
    # [2: Counter(), 3:Counter(), 4:Counter]
    ngrams={length: collections.Counter() for length in lengths}
    
    # create queue of max length maxlen
    queue=collections.deque(maxlen=max_length)
        
    def add_queue():
        current=tuple(queue)
        for length in lengths:
            if len(current)>= length:
                ngrams[length][current[:length]] += 1
                
    # iterate over all the lines    
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue)>= max_length:
                # after every element over maxlength, queue pops the word from the front
                add_queue()
                
    
    while len(queue)> min_length:
        queue.popleft()
        add_queue()
     
    return ngrams

def print_common_ngrams(ngrams,num=50):
    
    for n in sorted(ngrams):
        for gram,count in ngrams[n].most_common(num):
            string= "{0} {1} ".format(' '.join(gram), count)
            print string

In [90]:
data_folder='./data/'

#content_dict = {}

total_content=[]
for content in os.listdir(data_folder): # "." means current directory
    filepath=data_folder+ content
    lines=[]
    with open(filepath) as f:
        lines=f.readlines()
    f.close()    
    #print lines
    #print content
    #print len(lines)
    for line in lines :
        total_content.append(line)

#len(total_content)

In [91]:
min_length=1
max_length=2

print 'total lines in the data are :  ' , len(total_content)

ngrams=count_ngrams(lines,min_length,max_length)


total lines in the data are :   1712

In [92]:
print print_common_ngrams(ngrams,50)


the 186 
of 75 
in 61 
perovskite 46 
a 45 
to 44 
and 42 
is 37 
for 33 
at 31 
this 30 
conversion 27 
that 27 
circuit 26 
with 23 
short 22 
film 19 
films 19 
iodide 19 
we 19 
devices 18 
as 17 
are 17 
lead 17 
temperature 17 
c 17 
be 17 
immersion 16 
from 15 
crystal 15 
solar 14 
cells 14 
by 14 
when 14 
was 13 
not 12 
performance 12 
layer 12 
can 11 
temperatures 11 
figure 10 
tio2 10 
where 10 
2 10 
solution 10 
on 10 
higher 9 
oriented 9 
supplementary 9 
photovoltaic 9 
of the 26 
in the 24 
the perovskite 22 
short circuit 21 
lead iodide 17 
solar cells 14 
to the 11 
can be 9 
the conversion 9 
the supplementary 9 
60 c 8 
crystal size 8 
circuit currents 8 
perovskite solar 8 
perovskite film 7 
that the 7 
supplementary material 7 
the substrate 7 
in figure 7 
when the 6 
in a 6 
perovskite crystal 6 
circuit current 6 
as the 6 
perovskite films 6 
photovoltaic performance 6 
co workers 5 
planar heterojunction 5 
with the 5 
open circuit 5 
the highest 5 
and the 5 
here that 5 
and co 5 
shown in 5 
power conversion 5 
iodide layer 5 
fully converted 5 
perovskite crystals 5 
deposition conversion 5 
the films 5 
in this 5 
it is 5 
for the 5 
tio2 layer 4 
conversion technique 4 
on the 4 
c the 4 
we show 4 
immersion temperature 4 
None

In [ ]:


In [ ]: