In [1]:
# Setup (Imports)
from datetime import datetime, timedelta
from collections import defaultdict

import requests
import random
import os
import re

import yqd

from Database import add_stock_ticks, add_headlines, clean_ticks, db

In [2]:
def consume_ticker_csv(stock, filename):
    """Loads data from csv file into database"""
    entries = []
    
    with open(os.path.join('..', 'data', filename), 'r') as tick_csv:
        
        for line in tick_csv:
            
            if "Date" not in line:
                
                date, open_, high, low, close, adj_close, volume = line.split(',')
                
                entries.append((stock, date, open_, high, low, close, adj_close, volume))
                
    add_stock_ticks(entries)
    
    clean_ticks()

def dl_ticker(stock, num_days=10):
    """Loads data from yahoo"""
    entries = []
    
    end_date = datetime.today()
    begin_date = end_date - timedelta(days=num_days)
    
    for line in yqd.load_yahoo_quote(stock, begin_date.strftime('%Y%m%d'), end_date.strftime('%Y%m%d')):
        
        if "Date" not in line and len(line) > 1:
                
                date, open_, high, low, close, adj_close, volume = line.split(',')
                
                entries.append((stock, date, open_, high, low, close, adj_close, volume))
                
    add_stock_ticks(entries)
    
    clean_ticks()

In [3]:
def basic_clean(text):
    return text.strip().replace("'", "'").replace(""", "").replace("&", "and").replace("(TM)", "")

def get_reddit_news(subs, search_terms, limit=None, praw_config='StockMarketML'):
    "Get headlines from Reddit"
    print('Downloading Reddit Posts: ' + ", ".join(subs))
    
    from praw import Reddit
    
    reddit = Reddit(praw_config)

    articles = defaultdict(list)
    
    used = []
    
    for term in search_terms:

        for submission in reddit.subreddit('+'.join(subs)).search(term, limit=limit):
            
            if submission.title.count(' ') > 4 and submission.title not in used:
                
                used.append(submission.title)
                
                date_key = datetime.fromtimestamp(submission.created).strftime('%Y-%m-%d')

                articles[date_key].append(submission.title)
        
    return articles

def get_reuters_news(stock, pages=80):
    """Get headlines from Reuters"""
    print('Downloading Reuters: ' + stock)
    
    found_headlines = []
    
    articles = defaultdict(list)
    
    pattern_headline = re.compile('<h2><a [\s\S]+?>([\s\S]+?)<\/a>[\s\S]*?<\/h2>')
    
    date_current = datetime.now()
    
    while pages > 0:

        text = requests.get('http://www.reuters.com/finance/stocks/company-news/{}?date={}'.format(stock, date_current.strftime('%m%d%Y')),  headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}).text
        
        for match in pattern_headline.finditer(text):
            
            headline = match.group(1)
            
            headline = headline.replace('\u200d', '').replace('\u200b', '')
            
            headline = re.sub('^[A-Z]+[A-Z\d\s]*\-', '', headline)
            
            date_key = date_current.strftime('%Y-%m-%d')
            
            if headline not in found_headlines:
            
                articles[date_key].append(headline)
                found_headlines.append(headline)
        
        pages -= 1
        
        date_current -= timedelta(days=1)
        
    return articles

def get_twitter_news(querys, limit=100):
    """Get headlines from Twitter"""
    print('Downloading Tweets: ' + ", ".join(querys))
    
    from twitter import Twitter, OAuth
    import twitter_creds as c # Self-Created Python file with Creds

    twitter = Twitter(auth=OAuth(c.ACCESS_TOKEN, c.ACCESS_SECRET, c.CONSUMER_KEY, c.CONSUMER_SECRET))
    
    limit = min(limit, 100)
    
    articles = defaultdict(list)
    
    for query in querys:
    
        tweets = twitter.search.tweets(q=query, result_type='popular', lang='en', count=limit)['statuses']
        
        for tweet in tweets:
            
            text = re.sub(r'https?:\/\/\S+', '', tweet['text'])
            text = re.sub(r'[^\w\s:/]+', '', text)
            
            date = tweet['created_at']
            
            if '\n' not in text and len(text) > len(query) and ' ' in text:
                
                date_key = datetime.strptime(date, "%a %b %d %H:%M:%S %z %Y" ).strftime('%Y-%m-%d')
                
                articles[date_key].append(text)
                
    return articles

def get_seekingalpha_news(stock, pages=500):
    """Get headlines from SeekingAlpha"""
    print('Downloading SeekingAlpha: ' + stock)

    articles = defaultdict(list)

    re_headline = re.compile('<a class="market_current_title" [\s\S]+?>([\s\S]+?)<\/a>')
    re_dates = re.compile('<span class="date pad_on_summaries">([\s\S]+?)<\/span>')

    cookies = None

    for i in range(1, pages + 1):

        if i == 1:
            url = 'https://seekingalpha.com/symbol/{}/news'.format(stock)
        else:
            url = 'https://seekingalpha.com/symbol/{}/news/more_news_all?page={}'.format(stock, i)
            
        try:

            r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}, cookies=cookies)
        
        except Exception as e:
            
            print(e)
            continue
    
        text = r.text.replace('\\"', '"')
        cookies = r.cookies # SeekingAlpha wants cookies.

        headlines = [match.group(1) for match in re_headline.finditer(text)]
        dates = [match.group(1) for match in re_dates.finditer(text)]

        for headline, date in zip(headlines, dates):
            
            headline = headline.replace('(update)', '')
            
            date = date.replace('.', '')

            if 'Today' in date:
                date = datetime.today()
            elif 'Yesterday' in date:
                date = datetime.today() - timedelta(days=1)
            else:
                temp = date.split(',')
                if len(temp[0]) == 3:
                    date = datetime.strptime(temp[1], " %b %d").replace(year=datetime.today().year)
                else:
                    date = datetime.strptime("".join(temp[0:2]), "%b %d %Y")

            articles[date.strftime('%Y-%m-%d')].append(headline)

    return articles

def get_fool_news(stock, pages=40):
    "Get headlines from Motley Fool"
    print('Downloading MotleyFool: ' + stock)
    
    stock = stock.lower()
    
    re_headline = re.compile('<article id="article-\d+">[\s\S]+?">([\s\S]+?)<\/a>[\s\S]+?calendar"><\/i>([\s\S]+?20\d{2})')
    
    articles = defaultdict(list)
    
    for i in range(pages):
        
        if i == 0:
            url = "https://www.fool.com/quote/nasdaq/apple/{}/content".format(stock)
        else:
            url = "https://www.fool.com/quote/nasdaq/apple/{}/content/more?page={}".format(stock, i)
            
        try:
            
            text = requests.get(url).text
            
        except: # Timeout or something...
            
            pass
        
        headlines = [(match.group(1), match.group(2)) for match in re_headline.finditer(text)]
        
        for headline, date in headlines:
            
            date = datetime.strptime(date.strip(), "%b %d %Y")
            headline = basic_clean(headline)
            
            articles[date.strftime('%Y-%m-%d')].append(headline)
            
    return articles

def get_wsj(stock, pages=20):
    "Get headlines from WSJ"
    print('Downloading WSJ: ' + stock)
    
    re_headline = re.compile('<li class="\s+cr_pressRelease">[\s\S]+?"cr_dateStamp">([\s\S]+?)<\/li>[\s\S]+?href="http:\/\/www.wsj.com\/articles\S+?>([\s\S]+?)<\/a>')
    re_nextlink = re.compile('article_datetime" value="?(\d+\/\d+\/\d+)"?>[\s\S]+?article_docId" value="?(\d+)"?>[\s\S]+?newswire_datetime" value="?(\d+\/\d+\/\d+)"?>[\s\S]+?newswire_docId" value="?(\d+)"?>[\s\S]+?')
    
    url = "http://quotes.wsj.com/ajax/overview/5/US/{}?instrumentType=STOCK&significant=false".format(stock)
    
    articles = defaultdict(list)
    
    for i in range(pages):
        
        text = requests.get(url).text
        
        headlines = [(match.group(1), match.group(2)) for match in re_headline.finditer(text)]
        
        for date, headline in headlines:
            
            try:
                date = datetime.strptime(date.strip(), "%m/%d/%y")
            except:
                date = datetime.today()
                
            headline = basic_clean(headline)
            
            articles[date.strftime('%Y-%m-%d')].append(headline)
            
        nextpage_creds = re_nextlink.search(text)
        
        if nextpage_creds:
            
            nextpage_creds = [nextpage_creds.group(1), nextpage_creds.group(2), nextpage_creds.group(3), nextpage_creds.group(4)]
            url = "http://quotes.wsj.com/ajax/overview/5/US/{}?instrumentType=STOCK&significant=false&article_datetime={}&article_docId={}&newswire_datetime={}&newswire_docId={}".format(stock, *nextpage_creds)
        
        else:
            
            break
            
    return articles

def get_thestreet(stock, pages=60):
    "Get headlines from TheStreet"
    print('Downloading TheStreet: ' + stock)
    
    url = "https://www.thestreet.com/quote/{}/details/news?start=0&type=json".format(stock)
    
    articles = defaultdict(list)
    
    for i in range(pages):
    
        try:
            json = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}).json()
        except:
            break
        
        for story in json['stories']:
            
            if 'headline' and 'callout' in story and story['headline'] and story['callout']:
            
                headline = basic_clean(story['headline'])
                callout = basic_clean(story['callout'])
                date = datetime.strptime(story['publishDate'], "%Y-%m-%dT%H:%M:%SZ")

                articles[date.strftime('%Y-%m-%d')].append(headline)
                articles[date.strftime('%Y-%m-%d')].append(callout)
            
        url = "https://www.thestreet.com" + json['pagination']['nextDataUrl']
        
    return articles

In [4]:
def clean_headline(headline, dictionary):
    """
    Clean headline
    
    Removes extra chars and replaces words
    """
    headline = headline.lower()
    headline = re.sub('\d+%', 'STAT', headline)
    headline = re.sub('\b\d+\b', 'STAT', headline)
    headline = ''.join(c for c in headline if c in "abcdefghijklmnopqrstuvwxyz ")
    headline = re.sub('\s+', ' ', headline)
        
    for (word, replacement) in dictionary:
            
        headline = headline.replace(word, replacement)
        
    headline = headline.replace('STAT', '**STATISTIC**')
        
    headline = headline.replace('****', '** **') # Seperate joined kwords
    
    return headline.strip()

In [5]:
def save_headlines(headlines):
    """Save headlines to file"""
    
    for stock in headlines:
        
        entries = []
        
        with db() as (conn, cur):
        
            cur.execute("SELECT word, replacement FROM dictionary WHERE stock=? ORDER BY LENGTH(word) DESC", [stock])
            dictionary = cur.fetchall()
        
        for source in headlines[stock]:
            
            for date in headlines[stock][source]:
                
                for headline in headlines[stock][source][date]:
                    
                    cleaned_headline = clean_headline(headline, dictionary)
                    
                    entries.append((stock, date, source, cleaned_headline, headline, -999))
                    
        add_headlines(entries)

In [6]:
if __name__ == "__main__":
    
    headlines = {
            'GOOG': {
                'reddit': get_reddit_news(['google', 'Android', 'GooglePixel', 'news'], ['Google', 'pixel', 'android', 'stock']), 
                'reuters': get_reuters_news('GOOG.O'),
                'twitter': get_twitter_news(['@Google', '#Google', '#googlepixel', '#Alphabet']),
                'seekingalpha': get_seekingalpha_news('GOOG'),
                'fool': get_fool_news('GOOG'),
                'wsj': get_wsj('GOOG'),
                'thestreet': get_thestreet('GOOG')
            },
            'AAPL': {
                'reddit': get_reddit_news(['apple', 'ios', 'AAPL', 'news'], ['apple', 'iphone', 'ipad', 'ios', 'stock']), 
                'reuters': get_reuters_news('AAPL.O'),
                'twitter': get_twitter_news(['@Apple', '#Apple', '#IPhone', '#ios']),
                'seekingalpha': get_seekingalpha_news('AAPL'),
                'fool': get_fool_news('AAPL'),
                'wsj': get_wsj('AAPL'),
                'thestreet': get_thestreet('AAPL')
            },
            'MSFT': {
                'reddit': get_reddit_news(['microsoft', 'windowsphone', 'windows'], ['microsoft', 'phone', 'windows', 'stock']), 
                'reuters': get_reuters_news('MSFT.O'),
                'twitter': get_twitter_news(['@Microsoft', '#Windows', '#Microsoft', '#windowsphone']),
                'seekingalpha': get_seekingalpha_news('MSFT'),
                'fool': get_fool_news('MSFT'),
                'wsj': get_wsj('MSFT'),
                'thestreet': get_thestreet('MSFT')
            },
            'AMD': {
                'reddit': get_reddit_news(['Amd', 'AMD_Stock', 'pcmasterrace'], ['AMD', 'radeon', 'ryzen', 'stock']), 
                'reuters': get_reuters_news('AMD.O'),
                'twitter': get_twitter_news(['@AMD', '#AMD', '#Ryzen', '#radeon']),
                'seekingalpha': get_seekingalpha_news('AMD'),
                'fool': get_fool_news('AMD'),
                'wsj': get_wsj('AMD'),
                'thestreet': get_thestreet('AMD')
            },
            'AMZN': {
                'reddit': get_reddit_news(['amazon', 'amazonprime', 'amazonecho'], ['amazon', 'echo', 'prime', 'stock']), 
                'reuters': get_reuters_news('AMZN.O'),
                'twitter': get_twitter_news(['@amazon', '#Amazon', '#jeffbezos', '@amazonecho', '#amazonprime']),
                'seekingalpha': get_seekingalpha_news('AMZN'),
                'fool': get_fool_news('AMZN'),
                'wsj': get_wsj('AMZN'),
                'thestreet': get_thestreet('AMZN')
            },
            'INTC': {
                'reddit': get_reddit_news(['intel', 'hardware'], ['intel', 'cpu']),
                'reuters': get_reuters_news('INTC.O'),
                'twitter': get_twitter_news(['@intel']),
                'seekingalpha': get_seekingalpha_news('INTC'),
                'fool': get_fool_news('INTC'),
                'wsj': get_wsj('INTC'),
                'thestreet': get_thestreet('INTC')
            }
    }


Downloading Reddit Posts: google, Android, GooglePixel, news
Version 5.3.0 of praw is outdated. Version 5.4.0 was released Wednesday March 28, 2018.
Downloading Reuters: GOOG.O
Downloading Tweets: @Google, #Google, #googlepixel, #Alphabet
Downloading SeekingAlpha: GOOG
Downloading MotleyFool: GOOG
Downloading WSJ: GOOG
Downloading TheStreet: GOOG
Downloading Reddit Posts: apple, ios, AAPL, news
Downloading Reuters: AAPL.O
Downloading Tweets: @Apple, #Apple, #IPhone, #ios
Downloading SeekingAlpha: AAPL
Downloading MotleyFool: AAPL
Downloading WSJ: AAPL
Downloading TheStreet: AAPL
Downloading Reddit Posts: microsoft, windowsphone, windows
Downloading Reuters: MSFT.O
Downloading Tweets: @Microsoft, #Windows, #Microsoft, #windowsphone
Downloading SeekingAlpha: MSFT
Downloading MotleyFool: MSFT
Downloading WSJ: MSFT
Downloading TheStreet: MSFT
Downloading Reddit Posts: Amd, AMD_Stock, pcmasterrace
Downloading Reuters: AMD.O
Downloading Tweets: @AMD, #AMD, #Ryzen, #radeon
Downloading SeekingAlpha: AMD
Downloading MotleyFool: AMD
Downloading WSJ: AMD
Downloading TheStreet: AMD
Downloading Reddit Posts: amazon, amazonprime, amazonecho
Downloading Reuters: AMZN.O
Downloading Tweets: @amazon, #Amazon, #jeffbezos, @amazonecho, #amazonprime
Downloading SeekingAlpha: AMZN
Downloading MotleyFool: AMZN
Downloading WSJ: AMZN
Downloading TheStreet: AMZN
Downloading Reddit Posts: intel, hardware
Downloading Reuters: INTC.O
Downloading Tweets: @intel
Downloading SeekingAlpha: INTC
Downloading MotleyFool: INTC
Downloading WSJ: INTC
Downloading TheStreet: INTC

In [7]:
if __name__ == "__main__":

    save_headlines(headlines)

In [8]:
if __name__ == "__main__":
    
    dl_ticker('AAPL')
    dl_ticker('AMZN')
    dl_ticker('AMD')
    dl_ticker('GOOG')
    dl_ticker('MSFT')
    dl_ticker('INTC')