In [60]:
import pandas as pd
import urllib.request
import re
import sqlite3
import os
from xml.etree.ElementTree import parse
In [3]:
# load the list of RSS feeds from a comma-separated file
rss_list = pd.read_csv('rss_list.txt')
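In [ ]:
# quick look at the feed list; a minimal sketch assuming rss_list.txt holds
# one header row plus comma-separated (site name, feed url) pairs -- the
# actual column names come from that header
for row in rss_list.itertuples(index=False):
    print(row)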
In [4]:
# TODO add log files -> work, crashes
# TODO add saving to s3
# TODO get rss_list from file
# TODO saving to sql database (?)
In [48]:
def get_rss(url):
    # fetch the RSS feed at `url` and return the parsed XML root element
    # requires urllib.request and xml.etree.ElementTree
    try:
        request = urllib.request.urlopen(url)
        tree = parse(request)
        root = tree.getroot()
        return root
    except Exception as e:
        # TODO: write failures to a crashes log file instead of printing
        print("there was a problem parsing this url, error: {}".format(e))
In [20]:
def rss_parse(root, site_name):
    # parse a downloaded RSS tree -> return title, date, link, description, category, site
    # requires xml.etree.ElementTree
    # TODO: parse enclosure url (?) for pics
    title = []
    date = []
    link = []
    description = []
    category = []
    site = []
    for item in root.iterfind('channel/item'):
        title.append(item.findtext('title'))
        date.append(item.findtext('pubDate'))
        link.append(item.findtext('link'))
        # description may be missing, so guard before stripping
        description.append((item.findtext('description') or '').strip())
        category.append(item.findtext('category'))
        # debug: the enclosure URL is a tag attribute, not element text
        enclosure = item.find('enclosure')
        if enclosure is not None:
            print(enclosure.get('url'))
        site.append(site_name)
    return [title, date, link, description, category, site]
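In [ ]:
# usage sketch: chain get_rss and rss_parse for one feed; the url and the
# 'lenta' site label are assumptions
root = get_rss('https://lenta.ru/rss')
if root is not None:
    parsed = rss_parse(root, 'lenta')
    print('parsed {} items'.format(len(parsed[0])))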
In [12]:
def save_parsed(parsed_list, path, filename, mode, conn=None):
    # build a pandas dataframe from the parsed data and persist it as csv or sqlite
    # requires pandas
    # saving to s3 in a separate function (?)
    # dataframe from list of lists
    df = pd.DataFrame(parsed_list)
    # transpose the dataframe to get the right structure (one row per item)
    df = df.transpose()
    # assign column names
    df.columns = ['title', 'date', 'link', 'description', 'category', 'site']
    if mode == 'csv':
        # append without the header if the file already exists, otherwise create it
        if os.path.isfile(path + filename):
            df.to_csv(path + filename, mode='a', header=False, encoding='utf8', index=False)
        else:
            df.to_csv(path + filename, encoding='utf8', index=False)
    elif mode == 'sqlite':
        # save to a sqlite table named after `filename` through the supplied
        # connection; to_sql creates the table when it does not exist yet
        df.to_sql(filename, conn, if_exists='append', index=False)
    # free the dataframe
    del df
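In [ ]:
# usage sketch for the csv branch; './' and 'feeds.csv' are assumed values,
# and 'parsed' comes from the rss_parse sketch above
save_parsed(parsed, './', 'feeds.csv', 'csv')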
In [58]:
def create_sql_conn(sql_db_name, sql_total_table, sql_detail_table):
    # create a sql database connection (using sqlite)
    # creates the tables if they do not exist yet
    # takes sql_db_name plus the sql_total and sql_detail table names
    conn = sqlite3.connect(sql_db_name)
    cursor = conn.cursor()
    sql = 'create table if not exists {} (id integer, title VARCHAR(255), timestamp integer,\
        link VARCHAR(255), description VARCHAR(1024), category VARCHAR(128), site VARCHAR(32))'.format(sql_total_table)
    cursor.execute(sql)
    conn.commit()
    sql = 'create table if not exists {} (id integer, text VARCHAR(10192), photo_id integer)'.format(sql_detail_table)
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    return conn
In [ ]:
def save_pic(pic_url, pic_path):
    # download the picture at pic_url to the local pic_path
    # pic_path format: [path]/pic_name
    urllib.request.urlretrieve(pic_url, pic_path)
In [59]:
conn = create_sql_conn('data.db', 'total', 'detail')
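In [ ]:
# usage sketch for the sqlite branch: append the parsed rows through the
# connection created above; 'rss_items' is an assumed table name, used
# instead of 'total' because the dataframe column 'date' does not match
# the hand-made 'total' schema ('timestamp')
save_parsed(parsed, './', 'rss_items', 'sqlite', conn=conn)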
In [53]:
#save_pic('https://icdn.lenta.ru/images/2017/06/12/13/20170612131102377/pic_998932297f82155b05d7e3c32f98df2e.jpg','1.jpg')