正規表達式


In [2]:
import re 
m = re.match(r"(\w+)@(\w+)", "david@largtidata.com") 
print m.groups()
 
m = re.match(r"(\w+)@([a-z.]+)", "david@largtidata.com") 
print m.groups() 

m = re.match(r"(\d+)\.(\d+)", "1999.5") 
print m.groups() 

m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "David Chiu") 
print m.group('first_name'), m.group('last_name')
 
str1 = 'scp file.txt root@10.0.0.1:./' 
m=re.search('^scp ([\w\.]+) (\w+)@([\w\.]+):(.+)',str1) 
if m: 
    print m.group(1), m.group(2), m.group(3), m.group(4)


('david', 'largtidata')
('david', 'largtidata.com')
('1999', '5')
David Chiu
file.txt root 10.0.0.1 ./

In [2]:
import re
str1 = "0912-345-678"
str2 = "0912-345678"

m = re.match(r"(\d{4}-?\d{3}-?\d{3})", str1)
print m.groups()
#print re.match(r"\d{4}-?\d{3}-?\d{3}", str2)


('0912-345-678',)

使用file IO讀進資料


In [6]:
# -*- coding: utf-8 -*-
f = open('gov/51239291_0607L1030328.txt', 'r') 
#print f.read() 
f.close()

把資料丟進BeautifulSoup


In [10]:
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
f = open('gov/51239291_0607L1030328.txt', 'r')  
response_text = f.read() 
soup = BeautifulSoup(response_text) 
f.close()

抓出含有th 的列


In [6]:
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
f = open('gov3/51239291_0607L1030328.txt', 'r') 
response_text = f.read() 
f.close() 

soup = BeautifulSoup(''.join(response_text)) 
tr = soup.findAll('tr') 
for row in tr[1:5]: 
    print row.find('th')


None
None
<th align="left" bgcolor="#DAEBED" class="T11b" valign="middle" width="200">
     機關代碼
    </th>
<th align="left" bgcolor="#DAEBED" class="T11b" valign="middle" width="200">
     機關名稱
    </th>

列出表格中所有th 名稱


In [12]:
for row in tr[1:10]: 
    th = row.find('th') 
    #print th.text.strip()
    if th is not None: 
        print th.text.strip()


機關代碼
機關名稱
單位名稱
機關地址
聯絡人
聯絡電話
傳真號碼

Strip 範例


In [45]:
s = " \t string test \t \n\n\r" ' \t string test \t \n\n\r' 
print s


 	 string test 	 

 	 string test 	 



In [46]:
print s.strip()


string test 	 

 	 string test

In [47]:
print s.rstrip()


 	 string test 	 

 	 string test

In [48]:
print s.lstrip()


string test 	 

 	 string test 	 


找出”機關代碼”


In [14]:
for row in tr: 
    th = row.find('th') 
    if th is not None: 
        if th.text.encode('utf-8').strip() == "機關代碼": 
        	print row.find('td').text.strip()


3.13.30

Dictionary 範例


In [27]:
#declare dictionary 
dic = {'a':100, 'b':"yes", 'c':0.98} 
print dic 

#get keys in dictionary 
print dic.keys() 

#get values in dictionary 
print dic.values() 

#get value of given key 
print dic['a'] 

#get value of given key 
print dic.get('a')


{'a': 100, 'c': 0.98, 'b': 'yes'}
['a', 'c', 'b']
[100, 0.98, 'yes']
100
100

Dictionary 範例(二)


In [28]:
#add entry into dictionary 
dic['d'] = 'new' 
print dic 

#add entry into dictionary 
dic.update({'e':123}) 
print dic 

# iter the dictionary 
for rec in dic: 
    print rec, dic[rec]


{'a': 100, 'c': 0.98, 'b': 'yes', 'd': 'new'}
{'a': 100, 'c': 0.98, 'b': 'yes', 'e': 123, 'd': 'new'}
a 100
c 0.98
b yes
e 123
d new

可以使用dictionary 管理


In [15]:
dic = {"機關代碼":"", "機關名稱":"", "機關地址":""} 
for row in tr: 
    th = row.find('th') 
    if th is not None: 
        if th.text.encode('utf-8').strip() in dic: 
            print row.find('td').text.strip()


3.13.30
臺灣糖業股份有限公司
701臺南市東區生產路68號

把資料塞到dictionary value


In [30]:
dic = {"機關代碼":"", "機關名稱":"", "機關地址":""} 
for row in tr: 
    th = row.find('th') 
    if th is not None: 
        if th.text.encode('utf-8').strip() in dic: 
            dic[th.text.encode('utf-8').strip()] = row.find('td').text.encode('utf-8').strip() 
print dic


{'\xe6\xa9\x9f\xe9\x97\x9c\xe5\x90\x8d\xe7\xa8\xb1': '\xe8\x87\xba\xe7\x81\xa3\xe7\xb3\x96\xe6\xa5\xad\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8', '\xe6\xa9\x9f\xe9\x97\x9c\xe4\xbb\xa3\xe7\xa2\xbc': '3.13.30', '\xe6\xa9\x9f\xe9\x97\x9c\xe5\x9c\xb0\xe5\x9d\x80': '701\xe8\x87\xba\xe5\x8d\x97\xe5\xb8\x82\xe6\x9d\xb1\xe5\x8d\x80\xe7\x94\x9f\xe7\x94\xa2\xe8\xb7\xaf68\xe8\x99\x9f'}

把資料從字典中取出來


In [31]:
for rec in dic: 
    print rec, dic[rec]


機關名稱 臺灣糖業股份有限公司
機關代碼 3.13.30
機關地址 701臺南市東區生產路68號

取得標的分類


In [16]:
dic = {"標的分類":""} 
for row in tr: 
    th = row.find('th') 
    if th is not None: 
        if th.text.encode('utf-8').strip() in dic: 
            dic[th.text.encode('utf-8').strip()] = row.find('td').text.encode('utf-8').strip()
for rec in dic: 
    print rec, dic[rec]


標的分類 <勞務類>    
                            752
                            電信服務

使用repr 找出formatting


In [18]:
a = '\t\t string \n\t' 
print a 
print repr(a)


		 string 
	
'\t\t string \n\t'

看看string 內含什麼字元


In [19]:
for rec in dic: 
    print repr(rec), repr(dic[rec])


'\xe6\xa8\x99\xe7\x9a\x84\xe5\x88\x86\xe9\xa1\x9e' '<\xe5\x8b\x9e\xe5\x8b\x99\xe9\xa1\x9e>    \r\n                            752\r\n                            \xe9\x9b\xbb\xe4\xbf\xa1\xe6\x9c\x8d\xe5\x8b\x99'

將資料依分隔符號切成陣列


In [20]:
a = '123,555,111,99' 
print a.split(',') 
print a.split(',',1)


['123', '555', '111', '99']
['123', '555,111,99']

將陣列單元依分隔符號合成字串


In [21]:
a = '123,555,111,99' 
ary = a.split(',') 
print '|'.join(ary)


123|555|111|99

如何移除字串中內含的 換行符號及空白


In [22]:
for rec in dic: 
    print ''.join(dic[rec].split())


<勞務類>752電信服務

時間跟字串轉換


In [23]:
from datetime import date,datetime 
currenttime = datetime.now() 
print currenttime.strftime("%Y-%m-%d") 


a = '2014-05-03 14:00' 
print datetime.strptime(a, "%Y-%m-%d %H:%M")


2014-11-18
2014-05-03 14:00:00

取得年分


In [25]:
response_date = '102/12/10 10:30' 
getyear = response_date.split('/',1) 
print getyear


['102', '12/10 10:30']

轉換成西元年


In [40]:
bctime= str(int(getyear[0]) + 1911) + "/" + ''.join(getyear[1:]) 
print bctime


2013/12/10 10:30

轉換成時間格式


In [41]:
response_date = '102/12/10 10:30' 
getyear = response_date.split('/', 1) 
bctime= str(int(getyear[0]) + 1911) + "/" + ''.join(getyear[1:]) 
print datetime.strptime(bctime, "%Y/%m/%d %H:%M")


2013-12-10 10:30:00

使用資料宣告


In [28]:
tenderer_dic = {"廠商代碼":"tenderer_code", "廠商名稱":"tenderer_name", "是否得標":"awarded", "組織型態":"orgnization_type"} 
result_dic = {}

找到award_table_tr_3


In [29]:
tender_table = soup.find('table', { "class" : "table_block tender_table" }) 
award_table_tr_3 = tender_table.findAll( 'tr',{'class':'award_table_tr_3'})

取得廠商編號


In [30]:
import re 
m = re.match(r'投標廠商(\d+)' ,'投標廠商101') 
print m.group(1)


101

統整程式碼


In [39]:
for tr in award_table_tr_3: 
    tb =tr.find('table') 
    grp_num = 0 
    if tb is not None: 
        row = tb.findAll('tr') 
        for r in row: 
            th = r.find('th').text 
            m = re.match(r'投標廠商(\d+)' ,th.encode('utf-8').strip()) 
            if m is not None: 
                grp_num = int(m.group(1)) 
                result_dic[grp_num] = {'tenderer_num': grp_num} 
            else: 
                if th.encode('utf-8').strip() in tenderer_dic: 
                    result_dic[grp_num][tenderer_dic[th.encode('utf-8').strip()]] = r.find('td').text.strip()
#print result_dic

for rec in result_dic:
    for i in  result_dic[rec]:
        print i, result_dic[rec][i]


tenderer_name 中華電信股份有限公司臺灣南區電信分公司
orgnization_type 公司登記
tenderer_code 96979960
tenderer_num 1
awarded 是

處理金額問題


In [41]:
import re 
m = re.match( r"\$?-?([0-9,]+)", '$352,111元') 
print ''.join(m.group(1).split(','))


352111

整理品項跟得標廠商


In [44]:
award_dic = {}
tender_award_dic = {"得標廠商":"awarded_tenderer", "預估需求數量":"request_number", "決標金額":"tender_awarding_value", "底價金額":"floor_price_value"}
award_table_tr_4 = tender_table.findAll( 'tr',{'class':'award_table_tr_4'})		
for tr in award_table_tr_4:
	tb =tr.find('table')
	if tb is not None:
		row = tb.findAll('tr')
		item_num = 0
		grp_num = 0
		for r in row:
			if r.find('th') is not None:
				th = r.find('th').text
				#print r.find('th').text
				m = re.match(r'第(\d+)品項' ,th.encode('utf-8').strip())
				m2 = re.match(r'得標廠商(\d+)' ,th.encode('utf-8').strip())
				if m is not None:
					item_num =  int(m.group(1).decode('utf-8'))
					award_dic[item_num] = {}
				elif m2 is not None:
					grp_num =  int(m2.group(1).decode('utf-8'))
					award_dic[item_num][grp_num] = {}			
				else:
					if th.encode('utf-8').strip() in tender_award_dic:
						#print th.encode('utf-8').strip().decode('utf-8')
						if th.encode('utf-8').strip() == "決標金額" or th.encode('utf-8').strip() == "底價金額":
							m = re.match( r"\$?-?([0-9,]+)", "".join( r.find('td').text.split()))
							award_dic[item_num][grp_num][tender_award_dic[th.encode('utf-8').strip()]] =  int(''.join(m.group(0).split(',')))							
						else:
							award_dic[item_num][grp_num][tender_award_dic[th.encode('utf-8').strip()]] = r.find('td').text.strip()
#print award_dic

for rec in award_dic[1][1]:
    print rec, award_dic[1][1][rec]


request_number 1
awarded_tenderer 中華電信股份有限公司臺灣南區電信分公司
floor_price_value 305000
tender_awarding_value 301500

步驟 0 引用套件


In [45]:
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
from datetime import datetime 
import os, sys, re

步驟1 建立資料抽出函式


In [46]:
def get_response_element (file_name): 
    f = open(file_name, 'r') 
    response_text = f.read() 
    f.close() 
    soup = BeautifulSoup(''.join(response_text)) 
    tender_table = soup.find('table', { "class" : "table_block tender_table" }) 
    tr = tender_table.findAll('tr') 
    return tr

步驟2 建立時間轉換函式


In [47]:
def date_conversion(element):
        m = re.match(r"(\d+/\d+/\d+)",element)
        if m is not None:
                t = m.group(1).split('/', 1)
                if t[0] != '':
                        return datetime.strptime(str(int(t[0]) + 1911) + "/" + (''.join(t[1:]).split()[0] ), "%Y/%m/%d")
        else:
                return None

步驟3 建立金額轉換函式


In [48]:
def money_conversion(element): 
    m = re.match( r"\$?-?([0-9,]+)", "".join(element.split())) 
    return int(''.join(m.group(0).split(',')))

步驟4 建立移除空白函式


In [49]:
def remove_space(element): 
    return "".join(element.split())

步驟5 建立轉換字典


In [50]:
name_map = {"機關代碼":"entity_code", "機關名稱":"procuring_entity","標案案號":"job_number","招標方式":"procurement_type",\
    "決標方式":"tender_awarding_type","標案名稱":"subject_of_procurement", "決標資料類別":"attr_of_tender_awarding", \
    "標的分類":"attr_of_procurement", "預算金額":"budget_value", "開標時間":"opening_date","決標公告日期":"tender_awarding_announce_date",\
    "歸屬計畫類別":"project_type","總決標金額":"total_tender_awarding_value","底價金額":"floor_price_value","決標日期":"tender_awarding_date", "pkAtmMain":"pkAtmMain"}

tender_awards_map = {"機關代碼":remove_space, "機關名稱":remove_space, "標案案號":remove_space, \
    "招標方式":remove_space,"決標方式":remove_space,"標案名稱":remove_space, "決標資料類別":remove_space, \
    "標的分類":remove_space, "預算金額":money_conversion, "開標時間":date_conversion,"歸屬計畫類別":remove_space, \
    "總決標金額":money_conversion,"底價金額":money_conversion,"決標日期":date_conversion, "決標公告日期":date_conversion}

tender_award_item_map = {"得標廠商":"awarded_tenderer", "預估需求數量":"request_number", "決標金額":"tender_awarding_value", "底價金額":"floor_price_value"}
tenderer_map = {"廠商代碼":"tenderer_code", "廠商名稱":"tenderer_name", "是否得標":"awarded", "組織型態":"orgnization_type"}

步驟6 整理機關、已公告、決標資料


In [51]:
def get_award_info_dic (element):
	returned_dic = {}
	for row in element:
		th = row.find('th')
		if th is not None and row.get('class') is not None:
		    th_name = th.text.encode('utf-8').strip()
		    if ('award_table_tr_1' or 'award_table_tr_2' or  'award_table_tr_6'  in row.get('class')) and  (th_name in  tender_awards_map):
		        returned_dic[name_map[th_name]]   =   tender_awards_map[th_name](row.find('td').text.strip())
	return returned_dic

步驟7 整理投標廠商


In [52]:
def get_tenderer_info_dic (element):
	returned_dic = {}
	grp_num = 0
	for tr in element:
		if  tr.get('class') is not None and tr.get('class')[0] == 'award_table_tr_3' and tr.find('table') is not None :
			row = tr.find('table').findAll('tr')          
			for r in row:
				if r.find('th') is not None:
					th = r.find('th').text.encode('utf-8').strip()
					m = re.match(r'投標廠商(\d+)' ,th)
					if m is not None:
						grp_num =  int(m.group(1).decode('utf-8'))
						returned_dic[grp_num] = {'tenderer_num': grp_num}
					else:
						if th in tenderer_map:
							returned_dic[grp_num][tenderer_map[th]] = r.find('td').text.strip()
	return returned_dic

步驟8 整理決標品項


In [53]:
def get_tender_award_item_dic (element):
	returned_dic = {}
	item_num = 0
	grp_num = 0
	for tr in element:
		if tr.get('class') is not None and tr.get('class')[0] == 'award_table_tr_4' and tr.find('table') is not None:
			row = tr.find('table').findAll('tr')
			for r in row:
				if r.find('th') is not None:
					th = r.find('th').text.encode('utf-8').strip()
					m = re.match(r'第(\d+)品項' ,th)
					m2 = re.match(r'得標廠商(\d+)' ,th)
					if m is not None:
						item_num =  int(m.group(1).decode('utf-8'))
						returned_dic[item_num] = {}
					elif m2 is not None:
						grp_num =  int(m2.group(1).decode('utf-8'))
						returned_dic[item_num][grp_num] = {}			
					else:
						if th in tender_award_item_map:
							if th == "決標金額" or th == "底價金額":
								returned_dic[item_num][grp_num][tender_award_item_map[th]] = money_conversion(r.find('td').text) 							
							else:
								returned_dic[item_num][grp_num][tender_award_item_map[th]] = r.find('td').text.strip()
	return returned_dic

步驟9 瀏覽整理過的資料


In [60]:
ele = get_response_element('gov3/51239291_0607L1030328.txt') 
award_info_dic = get_award_info_dic(ele) 
tenderer_info_dic = get_tenderer_info_dic(ele) 
tender_award_item_dic = get_tender_award_item_dic(ele)
#
#for i in award_info_dic:
#    print i, award_info_dic[i]
for i in  tender_award_item_dic[1][1]:
    print i , tender_award_item_dic[1][1][i]
#print tender_award_item_dic


 request_number 1
awarded_tenderer 中華電信股份有限公司臺灣南區電信分公司
floor_price_value 305000
tender_awarding_value 301500

使用python 連結SQLite


In [61]:
import sqlite3 as lite 
import sys 
con = None 
dbname = 'nsb.sqlite'
try: 
    con = lite.connect(dbname) # connect to database 
    cur = con.cursor() # create cursor 
    cur.execute('SELECT SQLITE_VERSION()') # selece database version  
    data = cur.fetchone() # fetch one data at a time 
    print "SQLite version: %s" % data 
except lite.Error, e: 
    print "Error %s:" % e.args[0] 
    sys.exit(1) 
finally: 
    if con: 
        con.close()


SQLite version: 3.7.9

使用python 連結SQLite (2)


In [62]:
import sqlite3 as lite 
import sys 
con = lite.connect("test.db") 
with con: 
    cur = con.cursor() # Drop Table If Exisits 
    cur.execute("DROP TABLE IF EXISTS PhoneAddress") 
    cur.execute("CREATE TABLE PhoneAddress(phone CHAR(10) PRIMARY KEY, address TEXT, name TEXT unique, age INT NOT NULL)") 
    cur.execute("INSERT INTO PhoneAddress VALUES('0912173381','United State','Jhon Doe',53)") 
    cur.execute("INSERT INTO PhoneAddress VALUES('0928375018','Tokyo Japan','MuMu Cat',6)") 
    cur.execute("INSERT INTO PhoneAddress VALUES('0957209108','Taipei','Richard',29)") 
    cur.execute("SELECT phone,address FROM PhoneAddress") 
    data = cur.fetchall() 
    for rec in data: 
        print rec[0], rec[1] 
con.close()


0912173381 United State
0928375018 Tokyo Japan
0957209108 Taipei

使用python 連結SQLite (3)


In [63]:
import sqlite3 as lite 
import sys 
con = lite.connect("test.db") 
with con:
    
    cur = con.cursor()
    # Read Meta Information
    cur.execute("PRAGMA table_info(PhoneAddress)")    
    rows = cur.fetchall()
    for row in rows:
        print row
    
con.close()


(0, u'phone', u'CHAR(10)', 0, None, 1)
(1, u'address', u'TEXT', 0, None, 0)
(2, u'name', u'TEXT', 0, None, 0)
(3, u'age', u'INT', 1, None, 0)

fetchone


In [65]:
import sqlite3 as lite 
import sys 
con = lite.connect("test.db") 
with con: 
    cur = con.cursor() # Drop Table If Exisits 
    cur.execute("SELECT phone,address FROM PhoneAddress") 
    data = cur.fetchone() 
    print data[0], data[1] 
    data1 = cur.fetchone() 
    print data1[0], data1[1] 
con.close()


0912173381 United State
0928375018 Tokyo Japan

fetchall


In [67]:
import sqlite3 as lite 
import sys 
con = lite.connect("test.db") 
with con: 
    cur = con.cursor() # Drop Table If Exisits 
    cur.execute("SELECT phone,address FROM PhoneAddress")
    data = cur.fetchone() 
    print data[0], data[1] 
    rows = cur.fetchall()
    for row in rows:
        print row

con.close()


0912173381 United State
(u'0928375018', u'Tokyo Japan')
(u'0957209108', u'Taipei')

建立資料庫連線


In [68]:
# -*- coding: utf-8 -*- 
import sqlite3

db = sqlite3.connect('tender.db') 
cur = db.cursor()

建立 Tender_awards


In [69]:
cur.execute('''CREATE TABLE IF NOT EXISTS Tender_awards(
                id INTEGER PRIMARY KEY,
                pkAtmMain TEXT,
                procuring_entity TEXT,
                entity_code TEXT,
                attr_of_procurement TEXT,
                opening_date DATETIME,
                procurement_type TEXT,
                tender_awarding_type TEXT,
                project_type TEXT,
                subject_of_procurement TEXT,
                job_number TEXT,
                budget_value BIGINTEGER,
                attr_of_tender_awarding TEXT,
                floor_price_value BIGINTEGER,
                tender_awarding_announce_date DATETIME,
                tender_awarding_date DATETIME,
                total_tender_awarding_value BIGINTEGER
                )''')


Out[69]:
<sqlite3.Cursor at 0x980d6a0>

建立 Tenderer


In [70]:
cur.execute('''CREATE TABLE IF NOT EXISTS Tenderer(
    id INTEGER PRIMARY KEY,
    pkAtmMain TEXT,
    job_number TEXT,
    tenderer_code TEXT,
    tenderer_name TEXT,                
    awarded TEXT,
    orgnization_type TEXT,
    tenderer_num INT
    )''')


Out[70]:
<sqlite3.Cursor at 0x980d6a0>

建立Tender_award_item


In [71]:
cur.execute('''CREATE TABLE IF NOT EXISTS Tender_award_item(
    id INTEGER PRIMARY KEY,
    pkAtmMain TEXT,
    job_number TEXT,
    awarded_tenderer TEXT,
    request_number INT,                
    tender_awarding_value BIGINT,
    floor_price_value BIGINT,
    item_num INT,
    awarded_num INT
    )''')


Out[71]:
<sqlite3.Cursor at 0x980d6a0>

檢視dictionary 內的資料


In [72]:
ele = get_response_element('gov3/51239291_0607L1030328.txt') 
award_info_dic = get_award_info_dic(ele) 
tenderer_info_dic = get_tenderer_info_dic(ele) 
tender_award_item_dic = get_tender_award_item_dic(ele)

print award_info_dic
print tenderer_info_dic 
print tender_award_item_dic


{'opening_date': datetime.datetime(2014, 3, 28, 0, 0), 'total_tender_awarding_value': 301500, 'tender_awarding_date': datetime.datetime(2014, 3, 28, 0, 0), 'entity_code': u'3.13.30', 'procurement_type': u'\u9650\u5236\u6027\u62db\u6a19(\u672a\u7d93\u516c\u958b\u8a55\u9078\u6216\u516c\u958b\u5fb5\u6c42)', 'procuring_entity': u'\u81fa\u7063\u7cd6\u696d\u80a1\u4efd\u6709\u9650\u516c\u53f8', 'attr_of_procurement': u'<\u52de\u52d9\u985e>752\u96fb\u4fe1\u670d\u52d9', 'tender_awarding_announce_date': datetime.datetime(2014, 4, 30, 0, 0), 'subject_of_procurement': u'\u96fb\u8a71\u4ea4\u63db\u6a5f\u7dad\u8b77\u4fdd\u990a', 'project_type': u'\u975e\u5c6c\u611b\u53f0\u5341\u4e8c\u9805\u8a08\u756b', 'attr_of_tender_awarding': u'\u6c7a\u6a19\u516c\u544a', 'budget_value': 337500, 'floor_price_value': 305000, 'job_number': u'0607L1030328', 'tender_awarding_type': u'\u6700\u4f4e\u6a19'}
{1: {'tenderer_name': u'\u4e2d\u83ef\u96fb\u4fe1\u80a1\u4efd\u6709\u9650\u516c\u53f8\u81fa\u7063\u5357\u5340\u96fb\u4fe1\u5206\u516c\u53f8', 'orgnization_type': u'\u516c\u53f8\u767b\u8a18', 'tenderer_code': u'96979960', 'tenderer_num': 1, 'awarded': u'\u662f'}}
{1: {1: {'request_number': u'1', 'awarded_tenderer': u'\u4e2d\u83ef\u96fb\u4fe1\u80a1\u4efd\u6709\u9650\u516c\u53f8\u81fa\u7063\u5357\u5340\u96fb\u4fe1\u5206\u516c\u53f8', 'floor_price_value': 305000, 'tender_awarding_value': 301500}}}

將字典塞入資料庫


In [82]:
#print ', '.join(award_info_dic.keys())
columns = ', '.join(award_info_dic.keys()) 
#', '.join('?' * len(award_info_dic))
placeholders = ', '.join('?' * len(award_info_dic)) 
sql = 'INSERT INTO Tender_awards ({}) VALUES ({})'.format(columns, placeholders) 
#print sql
#cur.execute(sql, award_info_dic.values())
db.commit()

In [83]:
print tenderer_info_dic


{1: {'tenderer_name': u'\u4e2d\u83ef\u96fb\u4fe1\u80a1\u4efd\u6709\u9650\u516c\u53f8\u81fa\u7063\u5357\u5340\u96fb\u4fe1\u5206\u516c\u53f8', 'orgnization_type': u'\u516c\u53f8\u767b\u8a18', 'tenderer_code': u'96979960', 'tenderer_num': 1, 'awarded': u'\u662f'}}

建立insert 指令


In [84]:
tenderer_sql = 'INSERT INTO Tenderer (pkAtmMain, job_number, tenderer_code, tenderer_name, awarded, orgnization_type, tenderer_num) VALUES (?,?,?,?,?,?,?) '

filename = "gov3/51239291_0607L1030328.txt"
file_param = filename.split(".txt")[0].split("_") 
pkAtmMain = file_param[0] 
job_number = '_'.join(file_param[1:]) 

for ele in tenderer_info_dic : 
        cur.execute(tenderer_sql , (pkAtmMain, job_number,  
          tenderer_info_dic[ele]["tenderer_code"],   
          tenderer_info_dic[ele]["tenderer_name"], 
          tenderer_info_dic[ele]["awarded"], 
          tenderer_info_dic[ele]["orgnization_type"], ele))
db.commit() 
db.close()

多筆資料插入


In [85]:
import sqlite3 as lite

salary = (
    (1 ,          'Paul'  ,      32 ,         'California',  20000),
    (2 ,          'Allen' ,      25 ,         'Texas'     ,  15000),
    (3 ,          'Teddy' ,      23 ,         'Norway'    ,  20000),
    (4 ,          'Mark'  ,      25 ,         'Rich-Mond' ,  65000),
    (5 ,          'David' ,      27 ,         'Texas'     ,  85000),
    (6 ,          'Kim'   ,      22 ,         'South-Hall',  45000),
    (7 ,          'James' ,      24 ,         'Houston'   ,  10000)
)
department = (
(1,           'IT Billing'  ,1),
(2,           'Engineerin'  ,2),
(3,           'Finance'     ,7),
(4,           'Finance'     ,5),
(5,           'Finance'     ,6)

)

execute v.s. executemany


In [86]:
con = lite.connect('test.db')

with con:
    
    cur = con.cursor()    
    
    cur.execute("DROP TABLE IF EXISTS Salary")
    cur.execute("DROP TABLE IF EXISTS Department")
    
    cur.execute("CREATE TABLE Salary(ID INT, NAME TEXT, AGE INT, ADDRESS TEXT, SALARY INT, FOREIGN KEY(ID) REFERENCES QOO(EMP_ID))")
    cur.executemany("INSERT INTO Salary VALUES(?, ?, ?, ?, ?)", salary)
    
    
    cur.execute("CREATE TABLE Department(ID INT PRIMARY KEY, DEPT CHAR(50), EMP_ID INT)")
    cur.executemany("INSERT INTO Department VALUES(?, ?, ?)", department)
con.close()

execute 效能


In [87]:
import time
dbname = 'test.db'
con = lite.connect(dbname)

items = tuple(range(1,100000))
with con:
    
    cur = con.cursor()    
    
    cur.execute("DROP TABLE IF EXISTS ptest")
    cur.execute("CREATE TABLE ptest(ID INT)")
    start = time.time()
    for i in range(0,100000):
        cur.execute("INSERT INTO ptest VALUES(%s)"%(i))
    end = time.time()
    print end -start
    cur.execute("select count(1) from ptest")
    rows = cur.fetchone()
    print rows

con.close()


1.66068100929
(100000,)

executemany 效能


In [89]:
dbname = 'test.db'
con = lite.connect(dbname)
with con:
    
    cur = con.cursor()    
    
    cur.execute("DROP TABLE IF EXISTS ptest")
    cur.execute("CREATE TABLE ptest(ID INT)")
    start = time.time()
    cur.executemany("INSERT INTO ptest VALUES(?)",((id_, ) for id_ in xrange(100000)))
    end = time.time()
    print end - start
    cur.execute("select count(1) from ptest")
    rows = cur.fetchone()
    print rows

con.close()


0.21133685112
(100000,)

Commit v.s. Rollback


In [91]:
con = lite.connect('test.db')

Phone = '0928375018'
Age = 9 

with con:

    cur = con.cursor()    

    cur.execute("UPDATE PhoneAddress SET age=? WHERE phone=?", (Age, Phone))        
    con.commit()
    #con.rollback()
    
    cur.execute("select * from PhoneAddress where phone=?", (Phone,)) 
    rows = cur.fetchall()

    for row in rows:
        print row
con.close()


(u'0928375018', u'Tokyo Japan', u'MuMu Cat', 9)

建立函式 insert_award_info


In [92]:
def insert_award_info (cur, data_dic, filename):
	file_param =  filename.split(".txt")[0].split("_")
	pkAtmMain = file_param[0]
	data_dic['pkAtmMain'] = pkAtmMain
	columns = ', '.join(data_dic.keys())
	placeholders = ', '.join('?' * len(data_dic))
	sql = 'INSERT INTO Tender_awards ({}) VALUES ({})'.format(columns, placeholders)	
	cur.execute(sql, data_dic.values())

建立函式 insert_tenderer_info


In [93]:
tenderer_sql = 'INSERT INTO Tenderer (pkAtmMain, job_number, tenderer_code, tenderer_name, awarded, orgnization_type, tenderer_num) VALUES (?,?,?,?,?,?,?)'

def insert_tenderer_info (cur, data_dic, filename, sql):
	file_param =  filename.split(".txt")[0].split("_")
	pkAtmMain = file_param[0]
	job_number = '_'.join(file_param[1:])
	for ele in data_dic:			
		cur.execute(sql, (pkAtmMain, job_number,  data_dic[ele]["tenderer_code"], data_dic[ele]["tenderer_name"],
		  data_dic[ele]["awarded"], data_dic[ele]["orgnization_type"], ele))

建立函式 insert_tenderawarditem_info


In [94]:
tenderawarditem_sql = 'INSERT INTO Tender_award_item (pkAtmMain, job_number, awarded_tenderer, request_number, tender_awarding_value, floor_price_value, item_num, awarded_num) VALUES (?,?,?,?,?,?,?,?)' 

def insert_tenderawarditem_info(cur, data_dic, filename, sql):
	file_param =  filename.split(".txt")[0].split("_")
	pkAtmMain = file_param[0]
	job_number = '_'.join(file_param[1:])
	for item in data_dic:
		for grp in data_dic[item]:
			if 'floor_price_value' not in data_dic[item][grp]:
				data_dic[item][grp]['floor_price_value'] = None
			if 'tender_awarding_value' not in data_dic[item][grp]:
				data_dic[item][grp]['tender_awarding_value'] = None
			cur.execute(sql, (pkAtmMain, job_number,  data_dic[item][grp]["awarded_tenderer"],  data_dic[item][grp]["request_number"],
		           data_dic[item][grp]["tender_awarding_value"],  data_dic[item][grp]["floor_price_value"], item, grp))

設定表格


In [95]:
def setup_table(cur):
	cur.execute('''CREATE TABLE IF NOT EXISTS Tender_awards(
                id INTEGER PRIMARY KEY,
                pkAtmMain TEXT,
                procuring_entity TEXT,
                entity_code TEXT,
                attr_of_procurement TEXT,
                opening_date DATETIME,
                procurement_type TEXT,
                tender_awarding_type TEXT,
                project_type TEXT,
                subject_of_procurement TEXT,
                job_number TEXT,
                budget_value BIGINTEGER,
                attr_of_tender_awarding TEXT,
                floor_price_value BIGINTEGER,
                tender_awarding_announce_date DATETIME,
                tender_awarding_date DATETIME,
                total_tender_awarding_value BIGINTEGER
                )''')
		
	cur.execute('''CREATE TABLE IF NOT EXISTS Tenderer(
                id INTEGER PRIMARY KEY,
                pkAtmMain TEXT,
                job_number TEXT,
		tenderer_code TEXT,
		tenderer_name TEXT,                
		awarded TEXT,
		orgnization_type TEXT,
		tenderer_num INT
                )''')		
	cur.execute('''CREATE TABLE IF NOT EXISTS Tender_award_item(
                id INTEGER PRIMARY KEY,
                pkAtmMain TEXT,
                job_number TEXT,		
		awarded_tenderer TEXT,
		request_number INT,                
		tender_awarding_value BIGINT,
		floor_price_value BIGINT,
		item_num INT,
		awarded_num INT
                )''')

程式流程


In [96]:
import sqlite3
import os
db = sqlite3.connect('tender2.db') 
cur = db.cursor() 
path = "gov3/" 
dirs = os.listdir( path ) 
setup_table(cur)
for filename in dirs: 
    ele = get_response_element(path + filename) 
    award_info_dic = get_award_info_dic(ele) 
    tenderer_info_dic = get_tenderer_info_dic(ele) 
    tender_award_item_dic = get_tender_award_item_dic(ele) 
    insert_award_info(cur, award_info_dic, filename) 
    insert_tenderer_info(cur, tenderer_info_dic, filename, tenderer_sql) 
    insert_tenderawarditem_info(cur, tender_award_item_dic, filename, tenderawarditem_sql) 
    db.commit() 
db.close()

查詢資料


In [149]:
import sqlite3
import os
db = sqlite3.connect('tender.db') 
cur = db.cursor() 
cur.execute('select * from Tender_awards')
row = cur.fetchall()
for r in row[1:5]:
    print r[0], r[1], r[2],r[3],r[4],r[5]
db.close()


2 None 臺灣糖業股份有限公司 3.13.30 <勞務類>752電信服務 2014-03-28 00:00:00
3 51239883 新北市立大觀國民中學 3.82.6.2 <勞務類>641旅館及其他住宿服務 2013-10-23 00:00:00
4 51241099 彰化縣政府 3.76.47 <勞務類>95會員組織服務 2014-04-15 00:00:00
5 51241135 桃園縣桃園市公所 3.76.43.51 <勞務類>96娛樂,文化,體育服務 2014-04-25 00:00:00