In [1]:
class OrderedList(object):
    """A list of values that ``append`` keeps in ascending order.

    The values live in ``self.nums``. ``append`` places each new value at its
    sorted position rather than at the end of the list.
    """

    def __init__(self):
        # Backing store; kept in ascending order by append().
        self.nums = []

    def append(self, num):
        """Insert ``num`` into ``self.nums`` at its sorted position.

        A value equal to existing entries is inserted after them.
        """
        # Function-scope import so the class does not rely on a module-level
        # import being present elsewhere in the file.
        from bisect import insort_right

        # insort_right finds the slot by binary search and handles the empty
        # list as well, so no special case is needed.
        insort_right(self.nums, num)

    def show(self):
        """Print the stored values on one line, separated by single spaces.

        Nothing is printed when the list is empty.
        """
        if self.nums:
            # A single-argument print(...) is valid on Python 2 and Python 3.
            print(" ".join(str(num) for num in self.nums))

    def find(self, num):
        """Return True if ``num`` is stored in the list, otherwise False."""
        return num in self.nums
# Feed values in arbitrary order; the list places each one at its sorted
# position, so show() prints them in ascending order.
ordered_list = OrderedList()
for value in (9, 4, 3, 2, 7, 1, 5):
    ordered_list.append(value)
ordered_list.show()
In [2]:
import os
from collections import defaultdict
from collections import OrderedDict
class TFIDFCaculator(object):
    """Score the words of every file found under a directory tree.

    Typical use is ``build_tf_dict()``, then ``build_df_dict()``, then
    ``get_tf_idf_dict()``.

    The score computed here is ``term count / document frequency``. It is not
    the conventional ``tf * log(N / df)`` weighting.
    """

    def __init__(self, path):
        """Remember the directory to scan; nothing is read until build_tf_dict()."""
        self.path = path
        # document key -> {lower-cased word -> occurrences in that document}
        self.tf_dict = {}
        # lower-cased word -> number of documents that contain it
        self.df_dict = defaultdict(int)

    def build_tf_dict(self):
        """Count word occurrences for every file under ``self.path``.

        Files are split on whitespace and words are lower-cased. Each document
        is keyed by its path relative to ``self.path``; for a file directly
        inside ``self.path`` that is just the file name.
        """
        for root, _dirs, files in os.walk(self.path):
            for name in files:
                # os.walk descends into subdirectories, so the file must be
                # opened relative to `root`, not to the top-level path.
                full_path = os.path.join(root, name)
                doc = os.path.relpath(full_path, self.path)
                counts = defaultdict(int)
                with open(full_path, 'r') as fin:
                    for word in fin.read().split():
                        counts[word.lower()] += 1
                self.tf_dict[doc] = counts

    def build_df_dict(self):
        """Count, for every word, how many documents in ``tf_dict`` contain it.

        The counts are derived from ``tf_dict`` rather than by re-reading the
        files, and are reset first so that calling this method again does not
        inflate them.
        """
        self.df_dict.clear()
        for counts in self.tf_dict.values():
            for word, count in counts.items():
                # Skip zero entries that a stray defaultdict lookup may have
                # created; they do not represent a real occurrence.
                if count:
                    self.df_dict[word] += 1

    def get_tf_idf_dict(self, reverse=True):
        """Return ``{document: OrderedDict(word -> score)}``.

        Args:
            reverse: when True (the default) each document's words are ordered
                from highest to lowest score; when False, lowest to highest.

        Raises:
            ZeroDivisionError: if a word in ``tf_dict`` has no entry in
                ``df_dict``, e.g. because build_df_dict() was not called.
        """
        tf_idf_dict = {}
        for doc, counts in self.tf_dict.items():
            scores = [
                (word, float(count) / self.df_dict[word])
                for word, count in counts.items()
            ]
            scores.sort(key=lambda pair: pair[1], reverse=reverse)
            tf_idf_dict[doc] = OrderedDict(scores)
        return tf_idf_dict
# Score every document in ./data_set (relative to the current working directory).
path = os.path.join(os.getcwd(), 'data_set')
tf_idf = TFIDFCaculator(path)
# Term counts must be built before document frequencies, and both before scoring.
tf_idf.build_tf_dict()
tf_idf.build_df_dict()
# Print each document name followed by its words ordered by score.
# Single-argument print(...) is valid on both Python 2 and Python 3.
for doc, tf_idf_dict in tf_idf.get_tf_idf_dict().items():
    print(doc)
    print(tf_idf_dict)
In [3]:
# Ordinary string literal: the trailing \n is a single newline character.
a = 'abcdef\n'
print(a)
# Raw string literal: the backslash is kept, so this ends with the two
# characters '\' and 'n'.
b = r'abcdef\n'
print(b)
In [4]:
import re
# re.search returns a match object when the pattern occurs anywhere in the text.
match = re.search(r'iii', 'piiig')
print(match)
print(match.group())
# 'piiig' has only three consecutive i's, so this pattern is not found and
# re.search returns None.
match = re.search(r'iiiig', 'piiig')
print(match)
In [5]:
# '.' matches any single character except a newline: two characters, then 'g'.
m = re.search(r'..g', 'piiig')
print(m.group())
# '\d' matches one digit; three in a row are required.
m = re.search(r'\d\d\d', 'p123g')
print(m.group())
# The digits are found even when surrounded by non-ASCII text.
m = re.search(r'\d\d\d', '오마이갓123이럴수가')
print(m.group())
# '\w' matches a word character (letter, digit or underscore), so the leading
# '@@' is skipped and the first three word characters are returned.
m = re.search(r'\w\w\w', '@@abcd!!')
print(m.group())
# Digits count as word characters too.
m = re.search(r'\w\w\w', '@@ab0!!')
print(m.group())
In [6]:
# One or more word characters, dots or hyphens on each side of an '@'.
m = re.search(r'[\w.-]+@[\w.-]+',
              "My email is macmath22@gmail.com")
# Explicit formatting prints "<match> <type>" on one line under both
# Python 2 and Python 3.
print('%s %s' % (m.group(), type(m.group())))
# The pattern has no capturing groups, so groups() is an empty tuple.
print(m.groups())
In [8]:
# Phone-number shape: '0' plus two digits, then 3-4 digits, then 4 digits,
# separated by hyphens. '^' anchors the match to the start of the text.
pattern = r'^0\d{2}-\d{3,4}-\d{4}'
m = re.search(pattern, '010-3457-6360')
print(m.group())
In [9]:
# Candidate identifiers; only those starting with a letter or underscore match.
variables = ['abc', '3dbd', 'a_bdd', 'good344', 'aB_23']
for v in variables:
    # Leading letters/underscores, then any run of word characters
    # ('\d' is already covered by '\w').
    m = re.search(r'^[a-zA-Z_]+[\w\d]*', v)
    # re.search returns None when there is no match; compare by identity.
    if m is not None:
        print(m.group())
In [10]:
# Named `text` rather than `str` so the builtin str type is not shadowed for
# the rest of the session.
text = 'What a nice weather macmath22@gmail.com, test@test.com mina@minas.net'
# Replace every email-like token with the literal word 'test'.
replaced = re.sub(r'[\w\.-]+@[\w\.-]+', r'test', text)
print(replaced)