notebook.community

Edit and run



In [1]:

    
import bisect


class OrderedList(object):
    """A collection of values that is kept in ascending order.

    Values must be added through append(); the sorted order is only
    guaranteed while self.nums is not modified directly.
    """

    def __init__(self):
        """Create an empty ordered list."""
        # Backing store, sorted ascending.
        self.nums = []

    def append(self, num):
        """Insert num at its sorted position.

        A value equal to existing entries is placed after them, i.e. just
        before the first strictly greater element.

        Args:
            num: Value to insert; must be comparable with the stored values.
        """
        # Binary search for the slot instead of scanning the list by hand.
        bisect.insort_right(self.nums, num)

    def show(self):
        """Print the stored values on one line, separated by single spaces.

        Nothing is printed when the list is empty.
        """
        if self.nums:
            # Single-argument print(...) is valid on both Python 2 and 3.
            print(' '.join(str(num) for num in self.nums))

    def find(self, num):
        """Return True if num is stored in the list, otherwise False."""
        return num in self.nums
    
    
# Feed the values in arbitrary order; OrderedList places each one at its
# sorted position as it is appended.
ordered_list = OrderedList()
for value in (9, 4, 3, 2, 7, 1, 5):
    ordered_list.append(value)

ordered_list.show()









    



1 2 3 4 5 7 9



In [2]:

    
import os
from collections import defaultdict
from collections import OrderedDict

class TFIDFCaculator(object):
    """Score the words of every text file found under a directory.

    The score of a word in a document is its occurrence count in that
    document divided by the number of documents that contain the word
    (tf / df). Typical use: build_tf_dict(), then build_df_dict(), then
    get_tf_idf_dict().
    """

    def __init__(self, path):
        """Remember the corpus directory and start with empty statistics.

        Args:
            path: Directory that is walked recursively for documents.
        """
        self.path = path
        # document key -> {lower-cased word -> occurrence count}
        self.tf_dict = {}
        # lower-cased word -> number of documents containing the word
        self.df_dict = defaultdict(int)

    def build_tf_dict(self):
        """Count word occurrences for every file under self.path.

        Words are whitespace-separated tokens, lower-cased. Each document is
        keyed by its path relative to self.path, which is the bare file name
        for files located directly in self.path.
        """
        for root, dirs, files in os.walk(self.path):
            for f in files:
                # os.walk descends into sub-directories, so the file lives in
                # `root`, which is not necessarily self.path.
                full_path = os.path.join(root, f)
                doc = os.path.relpath(full_path, self.path)

                counts = defaultdict(int)
                with open(full_path, 'r') as fin:
                    for word in fin.read().split():
                        counts[word.lower()] += 1
                self.tf_dict[doc] = counts

    def build_df_dict(self):
        """Count, for every word, the number of documents that contain it.

        The counts are derived from self.tf_dict, so build_tf_dict() must
        have been called first. Calling this method again rebuilds the
        counts from scratch instead of adding to the previous ones.
        """
        df_dict = defaultdict(int)
        for counts in self.tf_dict.values():
            for key, count in counts.items():
                # Skip zero entries that a plain lookup on the defaultdict
                # may have created; they are not real occurrences.
                if count:
                    df_dict[key] += 1
        self.df_dict = df_dict

    def get_tf_idf_dict(self, reverse = True):
        """Return the tf / df score of every word, grouped by document.

        Args:
            reverse: When True (default) words are ordered from highest to
                lowest score, otherwise from lowest to highest.

        Returns:
            dict mapping each document key to an OrderedDict of
            word -> score, ordered by score.

        Raises:
            ZeroDivisionError: If a word in self.tf_dict has no count in
                self.df_dict, e.g. when build_df_dict() was not called.
        """
        tf_idf_dict = {}

        for doc, counts in self.tf_dict.items():
            scores = [(key, float(value) / self.df_dict[key])
                      for key, value in counts.items()]
            # list.sort is stable, so words with equal scores keep their
            # original relative order.
            scores.sort(key = lambda item: item[1], reverse = reverse)
            tf_idf_dict[doc] = OrderedDict(scores)
        return tf_idf_dict
            
            
# Corpus directory: the `data_set` folder inside the current working
# directory. os.path.join picks the right separator for the platform.
path = os.path.join(os.getcwd(), 'data_set')
tf_idf = TFIDFCaculator(path)

# Term counts must be built before the document counts.
tf_idf.build_tf_dict()
tf_idf.build_df_dict()

# print(tf_idf.df_dict)

# Show each document key followed by its words ordered by score.
# Single-argument print(...) is valid on both Python 2 and 3.
for doc, tf_idf_dict in tf_idf.get_tf_idf_dict().items():
    print(doc)
    print(tf_idf_dict)



In [3]:

    
# In a normal string literal, \n is an escape sequence: the string ends
# with a single newline character.
a = 'abcdef\n'
print(a)

# raw string: backslashes are not interpreted, so the string ends with the
# two characters backslash and 'n'.
b = r'abcdef\n'
print(b)









    



abcdef

abcdef\n



In [4]:

    
import re

# re.search scans the whole string and returns a match object for the first
# place where the pattern matches.
match = re.search(r'iii', 'piiig')
print(match)
print(match.group())

# When the pattern matches nowhere, re.search returns None.
match = re.search(r'iiiig', 'piiig')
print(match)









    



<_sre.SRE_Match object at 0x03BBC608>
iii
None



In [5]:

    
# '.' matches any single character except a newline.
m = re.search(r'..g', 'piiig')
print(m.group())

# '\d' matches a single digit.
m = re.search(r'\d\d\d', 'p123g') 
print(m.group())

# The digits are found even when surrounded by non-ASCII text.
m = re.search(r'\d\d\d', '오마이갓123이럴수가') 
print(m.group())

# '\w' matches a word character: a letter, a digit or an underscore.
m = re.search(r'\w\w\w', '@@abcd!!')
print(m.group())

m = re.search(r'\w\w\w', '@@ab0!!')
print(m.group())









    



iig
123
123
abc
ab0



In [6]:

    
# One or more word characters, dots or hyphens on each side of an '@'.
m = re.search(r'[\w.-]+@[\w.-]+',
              "My email is macmath22@gmail.com")

# Formatting into a single string keeps the output identical on Python 2
# while also being valid on Python 3 (print(a, b) would print a tuple on 2).
print('%s %s' % (m.group(), type(m.group())))
# The pattern has no capturing groups, so groups() is an empty tuple.
print(m.groups())









    



macmath22@gmail.com <type 'str'>
()



In [8]:

    
# Phone number: '0' plus two digits, then 3-4 digits, then 4 digits,
# separated by hyphens. The pattern is anchored at the start only, so any
# text following the number is ignored rather than rejected.
pattern = r'^0\d{2}-\d{3,4}-\d{4}'
m = re.search(pattern, '010-3457-6360')
print(m.group())









    



010-3457-6360



In [9]:

    
variables = ['abc', '3dbd', 'a_bdd', 'good344', 'aB_23']

# Print the candidates that start like an identifier: a leading letter or
# underscore followed by word characters. Names starting with a digit are
# skipped. (The '\d' is redundant because '\w' already includes digits.)
for v in variables:
    m = re.search(r'^[a-zA-Z_]+[\w\d]*', v)
    # Compare against None by identity, not equality.
    if m is not None:
        print(m.group())









    



abc
a_bdd
good344
aB_23



In [10]:

    
# Named `text` rather than `str` so the builtin str() stays usable in the
# cells that follow.
text = 'What a nice weather macmath22@gmail.com, test@test.com mina@minas.net'
# re.sub replaces every non-overlapping match of the e-mail pattern.
replaced = re.sub(r'[\w\.-]+@[\w\.-]+', r'test', text)
print(replaced)









    



What a nice weather test, test test