In [2]:
with open('0203.csv', 'r') as f:
    for line in f.readlines():
        #print line.strip()
        pass
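The loop above only walks the raw lines. A minimal sketch with the standard csv module, assuming '0203.csv' is a plain comma-separated file with a header row, would parse each line into fields:
In [ ]:
import csv
# a quick look at the header, parsed into fields
with open('0203.csv', 'rb') as f:
    reader = csv.reader(f)
    header = next(reader)  # first row as column names
    print header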
In [10]:
import requests
from bs4 import BeautifulSoup as bs
payload = {
    'select_item': '1',
    'select_subitem': '1',
    'Submit': '%B7j%B4M'
}
res = requests.post('http://www.twse.com.tw/ch/listed/listing_profile_inquiry.php', data = payload)
res.encoding = 'big5'
soup = bs(res.text, 'html.parser')
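The 'Submit' value in the payload is a percent-encoded Big5 string, presumably the label on the form's search button; a quick sketch to see what it decodes to:
In [ ]:
import urllib
# decode the percent-encoded Big5 bytes of the Submit button label
print urllib.unquote('%B7j%B4M').decode('big5')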
In [18]:
import pandas as pd
dfs = pd.read_html(soup.select('.board_prod')[0].prettify('utf-8'), encoding = 'utf-8')
df = dfs[0]
df.columns = df.iloc[1]
In [19]:
df = df.drop([0,1])
df
Out[19]:
In [20]:
df.head()
Out[20]:
In [21]:
df.describe()
Out[21]:
In [34]:
import sqlite3 as lite
con = lite.connect('stock.sqlite')
df.to_sql('stock', con=con)
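To confirm the table landed in SQLite, it can be read straight back with pandas over the same connection:
In [ ]:
# read the table back out of stock.sqlite as a sanity check
pd.read_sql('SELECT * FROM stock', con).head()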
In [1]:
import datetime
import os
import pandas
seattle_fire_responses = pandas.read_csv('0203.csv', parse_dates=[
    'Datetime'
], usecols=[
    'Address',
    'Type',
    'Datetime',
    'Latitude',
    'Longitude',
    'Incident Number',
]).dropna()
len(seattle_fire_responses)
Out[1]:
In [2]:
seattle_fire_responses.head()
Out[2]:
In [9]:
import pandas as pd
df = pd.read_csv('data/gps.csv')
df.head()
Out[9]:
In [12]:
#df['city'].value_counts()
df['country'].value_counts()
df['country'].value_counts().head()
Out[12]:
In [14]:
coordinates = df.as_matrix(columns=['lon', 'lat'])
print type(coordinates)
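as_matrix() was deprecated in later pandas releases; on newer versions the same NumPy array comes from .values (an equivalent sketch):
In [ ]:
# equivalent on newer pandas, where as_matrix() is gone
coordinates = df[['lon', 'lat']].values
print type(coordinates)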
In [15]:
%matplotlib inline
In [27]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6), dpi=100)
co_scatter = plt.scatter(coordinates[:,0], coordinates[:,1])
plt.show()
In [28]:
most_index = df['city'].value_counts().head().index
# rows whose city is among the five most frequent
most = pd.DataFrame(df[df['city'].isin(most_index)])
# keep only the first row seen for each city
most.drop_duplicates(subset=['city'], take_last=False, inplace=True)
In [29]:
print most
In [33]:
# index of the top 6 cities
most_index = df['city'].value_counts().head(6).index
# rows whose city is among the top 6
most = pd.DataFrame(df[df['city'].isin(most_index)])
# keep only the first row seen for each city
most.drop_duplicates(subset=['city'], take_last=False, inplace=True)
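take_last was deprecated in later pandas releases in favor of the keep argument; the equivalent call on newer versions:
In [ ]:
# same behaviour on newer pandas: keep='first' replaces take_last=False
most.drop_duplicates(subset=['city'], keep='first', inplace=True)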
In [34]:
plt.figure(figsize=(10, 6), dpi=100)
co_scatter = plt.scatter(coordinates[:,0], coordinates[:,1])
In [36]:
for i, row in most.iterrows():
    print i, row
In [38]:
plt.figure(figsize=(10, 6), dpi=100)
co_scatter = plt.scatter(coordinates[:,0], coordinates[:,1])
for i, row in most.iterrows():
    plt.annotate(row['city'].decode('utf-8'),
                 xy=(row['lon'], row['lat']),
                 xytext=(row['lon'] + 1.5, row['lat'] + 0.6),
                 bbox=dict(boxstyle='round', color='k', fc='w', alpha=0.6),
                 xycoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='k', alpha=0.8))
plt.show()
In [39]:
# index of every country in the data
most_index = df['country'].value_counts().index
# one row per country, for labelling the plot
most = pd.DataFrame(df[df['country'].isin(most_index)])
# keep only the first row seen for each country
most.drop_duplicates(subset=['country'], take_last=False, inplace=True)
plt.figure(figsize=(10, 6), dpi=100)
co_scatter = plt.scatter(coordinates[:,0], coordinates[:,1])
for i, row in most.iterrows():
    plt.annotate(row['country'].decode('utf-8'),
                 xy=(row['lon'], row['lat']),
                 xytext=(row['lon'] + 1.5, row['lat'] + 0.6),
                 bbox=dict(boxstyle='round', color='k', fc='w', alpha=0.6),
                 xycoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='k', alpha=0.8))
plt.show()
In [41]:
import numpy as np
from sklearn.cluster import KMeans
# Compute clustering with KMeans
k_means = KMeans(n_clusters=10)
k_means.fit(coordinates)
k_means_cluster_centers = k_means.cluster_centers_
print k_means_cluster_centers
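A fitted KMeans model can also assign new points to the nearest centroid; a small sketch with a hypothetical (lon, lat) pair:
In [ ]:
# which cluster would a hypothetical point near Taipei fall into?
print k_means.predict(np.array([[121.5, 25.0]]))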
In [46]:
plt.figure(figsize=(10, 6), dpi=100)
plt.scatter(k_means_cluster_centers[:,0], k_means_cluster_centers[:,1], c='r', s=100)
plt.scatter(coordinates[:,0], coordinates[:,1], c='k', alpha=.3, s=10)
plt.show()
In [47]:
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.8, min_samples=2).fit(coordinates)
labels = db.labels_
print labels
In [48]:
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print set(labels)
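DBSCAN labels noise points -1; counting members per label shows the cluster sizes (a quick check with collections.Counter):
In [ ]:
from collections import Counter
# cluster sizes; the -1 key, if present, is the noise count
print Counter(labels)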
In [49]:
from sklearn import metrics
print('Estimated number of clusters: %d' % num_clusters)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(coordinates, labels))
In [50]:
def getCentroid(points):
    # points columns are [lon, lat]; return (mean_lat, mean_lon)
    n = points.shape[0]
    sum_lat = np.sum(points[:, 1])
    sum_lon = np.sum(points[:, 0])
    return (sum_lat / n, sum_lon / n)
In [51]:
clusters = pd.Series([coordinates[labels == i] for i in xrange(num_clusters)])
clusters
Out[51]:
In [57]:
print clusters[0]
def getCentroid(points):
    # points columns are [lon, lat]; return (mean_lat, mean_lon)
    n = points.shape[0]
    sum_lat = np.sum(points[:, 1])
    sum_lon = np.sum(points[:, 0])
    return (sum_lat / n, sum_lon / n)
getCentroid(clusters[0])
Out[57]:
In [55]:
for c in clusters:
    print len(c)
In [58]:
clusters = pd.Series([coordinates[labels == i] for i in xrange(num_clusters)])
lat = []
lon = []
for i, cluster in clusters.iteritems():
    representative_point = getCentroid(cluster)
    lat.append(representative_point[0])
    lon.append(representative_point[1])
In [59]:
lat
Out[59]:
In [60]:
lon
Out[60]:
In [61]:
zip(lat, lon)
Out[61]:
In [65]:
plt.figure(figsize=(10, 6), dpi=100)
plt.scatter(lon, lat, c='r', s=100)
plt.scatter(coordinates[:,0], coordinates[:,1], c='k', alpha=.3, s=10)
for i, row in most.iterrows():
    plt.annotate(row['country'].decode('utf-8'),
                 xy=(row['lon'], row['lat']),
                 xytext=(row['lon'] + 1.5, row['lat'] + 0.6),
                 bbox=dict(boxstyle='round', color='k', fc='w', alpha=0.6),
                 xycoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='k', alpha=0.8))
plt.show()
In [66]:
plt.figure(figsize=(10, 6), dpi=100)
plt.scatter(k_means_cluster_centers[:,0], k_means_cluster_centers[:,1], c='r', s=100)
plt.scatter(coordinates[:,0], coordinates[:,1], c='k', alpha=.3, s=10)
for i, row in most.iterrows():
    plt.annotate(row['country'].decode('utf-8'),
                 xy=(row['lon'], row['lat']),
                 xytext=(row['lon'] + 1.5, row['lat'] + 0.6),
                 bbox=dict(boxstyle='round', color='k', fc='w', alpha=0.6),
                 xycoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='k', alpha=0.8))
plt.show()
In [162]:
print corpus[0]  # corpus comes from an earlier run of the XML-parsing cell below
In [169]:
import jieba.posseg as pseg
for ele, pos in pseg.cut("我爱北京天安门"):
    print pos, ele
print ' '.join([ele for ele, pos in pseg.cut("我爱北京天安门") if pos in ['n', 'ns']])
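The POS codes come from jieba's ICTCLAS-style tag set: 'n' is a common noun, 'ns' a place name, 'nr' a person name, 'nt' an organization, 'nz' another proper noun, and 'vn'/'an' are verbal/adjectival nouns, which is why the filter in the next cell keeps exactly those tags.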
In [170]:
from xml.dom import minidom
from xml.etree import ElementTree
import jieba
import jieba.posseg as pseg
with open('1435449602.xml', 'r') as f:
    events = ElementTree.fromstring(f.read())
corpus = []
ary = []
for elem in events.findall('./channel/item'):
    guid = elem.find('guid').text
    title = elem.find('title').text
    description = elem.find('description').text
    pubDate = elem.find('pubDate').text
    source = elem.find('source').text
    ary.append(title)
    corpus.append(' '.join([ele for ele, pos in pseg.cut(description) if pos in ['ns', 'vn', 'nr', 'nt', 'nz', 'an', 'n']]))
In [171]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# vectorizer was never defined in the original cell;
# CountVectorizer builds the term-count matrix that TfidfTransformer reweights
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
X = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(X)
weight = tfidf.toarray()
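CountVectorizer followed by TfidfTransformer can be collapsed into a single step; an equivalent sketch using TfidfVectorizer:
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
# one-step equivalent of CountVectorizer + TfidfTransformer
tfidf = TfidfVectorizer().fit_transform(corpus)
weight = tfidf.toarray()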
In [172]:
db = DBSCAN(eps=0.6, min_samples=3, algorithm='brute', metric='cosine')
db_data = db.fit_predict(weight)
In [173]:
print db_data
In [174]:
labels = db.labels_
print labels
In [175]:
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print set(labels)
In [181]:
for idx, l in enumerate(db.labels_):
    if l == 1:
        print ary[idx]
In [182]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# re-run of the tf-idf cell above, with vectorizer again made explicit
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
X = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(X)
weight = tfidf.toarray()
In [185]:
from sklearn.metrics.pairwise import linear_kernel
#print linear_kernel(tfidf[0], tfidf).flatten()
cosine_similarities = linear_kernel(tfidf[0], tfidf).flatten()
print cosine_similarities
In [194]:
related_docs_indices = cosine_similarities.argsort()[::-1]
print ary[0]  # the query document; it ranks first with similarity 1.0
for index in related_docs_indices:
    if cosine_similarities[index] > 0.1:
        print ary[index], cosine_similarities[index]
In [195]:
# pip install seaborn
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
In [200]:
n_cosine_similarities = linear_kernel(tfidf[0:10], tfidf[0:10])
print n_cosine_similarities
In [201]:
sns.heatmap(n_cosine_similarities, annot=True, center=0, cmap='coolwarm')
Out[201]:
In [215]:
import requests
from bs4 import BeautifulSoup as bs
rs = requests.session()
res = rs.get('http://bsr.twse.com.tw/bshtm/bsMenu.aspx')
soup = bs(res.text, 'html.parser')
payload = {
    'RadioButton_Normal': 'RadioButton_Normal',
    'TextBox_Stkno': '2330',
    'btnOK': '查詢'
}
# the form also expects a CAPTCHA value, e.g. 'CaptchaControl1': '92D34',
# which has to be read off the page and filled in by hand
for inp in soup.select('input[type="hidden"]'):
    payload[inp['name']] = inp['value']
In [216]:
res2 = rs.post('http://bsr.twse.com.tw/bshtm/bsMenu.aspx', data=payload)
In [218]:
res3 = rs.get('http://bsr.twse.com.tw/bshtm/bsContent.aspx?v=t')
#print res3.text
In [221]:
import pandas as pd
dfs = pd.read_html(res3.text.encode('utf-8'), encoding = 'utf-8')
In [222]:
dfs[0]
Out[222]:
In [ ]: