In [60]:
import pandas as pd
import urllib.request
import re
import sqlite3
import os
from xml.etree.ElementTree import parse

In [3]:
# load the list of rss feeds
rss_list = pd.read_csv('rss_list.txt')
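# assumed layout of rss_list.txt (the file itself is not shown here), e.g.:
#   site,url
#   lenta,https://lenta.ru/rss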

In [4]:
# TODO add log files -> work, crashes
# TODO add saving to s3
# TODO get rss_list from file
# TODO saving to sql database (?)

In [48]:
def get_rss(url):
    # fetch an rss feed from url -> return the parsed xml root (None on failure)
    # requires urllib.request and xml.etree.ElementTree
    try:
        request = urllib.request.urlopen(url)
        tree = parse(request)
        root = tree.getroot()
        return root
    except Exception as e:
        # TODO: change to save in a crashes log file
        print("there is a problem with parsing this url, error: {}".format(e))

In [20]:
def rss_parse(root, site_name):
    # parse the downloaded rss data -> return title, date, link, description, category, site
    # requires xml.etree.ElementTree
    # TODO: parse enclosure url (?) for pics
    
    title = []
    date = []
    link = []
    description = []
    category = []
    site = []
    
    for item in root.iterfind('channel/item'):
        title.append(item.findtext('title'))
        date.append(item.findtext('pubDate'))
        link.append(item.findtext('link'))
        # guard against items without a description
        description.append((item.findtext('description') or '').strip())
        category.append(item.findtext('category'))
        # the enclosure url is an attribute of the <enclosure> element, not a subelement text
        enclosure = item.find('enclosure')
        print(enclosure.get('url') if enclosure is not None else None)
        site.append(site_name)
    
    return [title, date, link, description, category, site]
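
In [ ]:
# usage sketch, continuing the example above (get_rss returns None on failure):
if root is not None:
    parsed = rss_parse(root, 'lenta')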

In [12]:
def save_parsed(parsed_list, path, filename, mode):
    # build a pandas dataframe from the parsed data and save it to csv or sqlite
    # requires pandas
    # saving to s3 in a separate function (?)
    
    # dataframe from list of lists
    df = pd.DataFrame(parsed_list)
    # transpose the dataframe to get the right structure
    df = df.transpose()
    # append column names
    df.columns = ['title', 'date', 'link', 'description', 'category', 'site']
    
    if mode == 'csv':
        # save as csv file: append without the header if the file already exists
        if os.path.isfile(path + filename):
            df.to_csv(path + filename, mode='a', header=False, encoding='utf8', index=False)
        else:
            df.to_csv(path + filename, mode='w', encoding='utf8', index=False)
    elif mode == 'sqlite':
        # save to sqlite database: still a TODO (needs a table name and an
        # open connection, see create_sql_conn below)
        raise NotImplementedError('sqlite saving is not implemented yet')
    # clear memory from df
    del df
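
In [ ]:
# usage sketch; the path and filename are examples:
#save_parsed(parsed, './', 'parsed_rss.csv', mode='csv')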

In [58]:
def create_sql_conn(sql_db_name, sql_total_table, sql_detail_table):
    # create sql database connection (using sqlite)
    # create tables if they do not exist
    # takes sql_db_name, sql_total and sql_detail table names
    
    conn = sqlite3.connect(sql_db_name)
    cursor = conn.cursor()

    sql = 'create table if not exists {} (id integer, title VARCHAR(255), timestamp integer,\
    link VARCHAR(255), description VARCHAR(1024), category VARCHAR(128), site VARCHAR(32))'.format(sql_total_table)
    cursor.execute(sql)
    conn.commit()
    
    sql = 'create table if not exists {} (id integer, text VARCHAR(10192), photo_id integer)'.format(sql_detail_table)
    cursor.execute(sql)
    conn.commit()

    cursor.close()
    
    return conn
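
In [ ]:
# hedged sketch: with an open connection, a parsed dataframe could be
# appended to the total table; renaming 'date' to 'timestamp' to match
# the schema above is an assumption:
#df.rename(columns={'date': 'timestamp'}).to_sql('total', conn, if_exists='append', index=False)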

In [ ]:
def save_pic(pic_url, pic_path):
    # save the picture from pic_url to the local pic_path
    # pic_path format: [path]/pic_name
    urllib.request.urlretrieve(pic_url, pic_path)

In [59]:
conn = create_sql_conn('data.db', 'total', 'detail')


Out[59]:
<sqlite3.Connection at 0x1f8347c4650>
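
In [ ]:
# end-to-end sketch; the 'site' and 'url' column names are an assumption
# about the rss_list.txt layout:
#for row in rss_list.itertuples():
#    root = get_rss(row.url)
#    if root is not None:
#        parsed = rss_parse(root, row.site)
#        save_parsed(parsed, './', 'parsed_rss.csv', mode='csv')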

In [53]:
#save_pic('https://icdn.lenta.ru/images/2017/06/12/13/20170612131102377/pic_998932297f82155b05d7e3c32f98df2e.jpg','1.jpg')
