In [3]:
import pandas.io.data as web
import pandas
from datetime import datetime
import random
import math
import urllib
from lxml import html
import csv
import json

In [4]:
#download the BTC_ETH chart data (period=86400 seconds per point) from Poloniex
#and keep two local copies: the raw JSON response and a trimmed CSV
url = "https://poloniex.com/public?command=returnChartData&currencyPair=BTC_ETH&start=1405699200&end=9999999999&period=86400"

#close the HTTP response explicitly, even when read() fails
response = urllib.urlopen(url)
try:
    eth_prices = response.read()
finally:
    response.close()

#raw copy of the response, exactly as received
outfilename = 'eth_prices.json'
with open(outfilename, 'w') as outfile:
    outfile.write(eth_prices)

df = pandas.read_json(eth_prices)

#render the CSV text first so a failure here does not leave an empty file behind;
#only the date, open and volume columns are kept, without index or header row
csvText = df.to_csv(index=False, header=False, columns=['date','open','volume'])

outfilename = 'eth_prices.csv'
with open(outfilename, 'w') as outfile:
    outfile.write(csvText)

In [8]:
#reformatting org file
#load the trimmed price CSV (columns: date, open, volume) into three parallel lists,
#skipping every row whose open price is not positive

#the path is spelled out instead of read from `outfilename`: later cells rebind that
#name to other files, so a re-run of this cell would otherwise parse the wrong file
pricesCsvName = 'eth_prices.csv'

dates = []
prices = []
volumes = []
with open(pricesCsvName, 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        openPrice = float(row[1])
        if openPrice > 0:
            dates.append(row[0])  #kept as the string stored in the CSV
            prices.append(openPrice)
            volumes.append(float(row[2]))

In [9]:
import random

def selectSampleIndex(listSize, sampleSize, maxSeqLen):
    """Pick `sampleSize` distinct random start indexes, sorted ascending.

    Every returned index lies in [0, listSize - maxSeqLen - 1], so a caller
    can still read `maxSeqLen` positions past any of them within a list of
    `listSize` items.

    Returns an empty list when the request cannot be satisfied:
    - listSize or sampleSize is not positive,
    - sampleSize is larger than listSize,
    - maxSeqLen is not smaller than listSize,
    - fewer than sampleSize candidate indexes exist.
    """
    if listSize <= 0 or sampleSize <= 0 or listSize < sampleSize or maxSeqLen >= listSize:
        return []

    # Only this many distinct indexes are available. Asking for more used to
    # spin forever in a rejection loop, so it is treated as an invalid request.
    candidateCount = listSize - maxSeqLen
    if sampleSize > candidateCount:
        return []

    # random.sample draws without replacement, so no retry loop is needed.
    return sorted(random.sample(range(candidateCount), sampleSize))

def selectPredictionIndex(listSize, marginDays):
    """Return the trailing `marginDays` indexes of a list of `listSize` items.

    An empty list is returned when listSize is not positive or when
    marginDays is not smaller than listSize.
    """
    requestIsValid = listSize > 0 and marginDays < listSize
    if not requestIsValid:
        return []

    firstIdx = listSize - marginDays
    return range(firstIdx, listSize)
    
def getDiffValue(lastDayValue, targetDayValue):
    """Return the change from lastDayValue to targetDayValue as a float."""
    change = targetDayValue - lastDayValue
    return float(change)

def getDiff(lastDayValue, targetDayValue):
    """Return the change from lastDayValue to targetDayValue divided by targetDayValue.

    Raises ZeroDivisionError when targetDayValue is 0.

    NOTE(review): the denominator is the target-day value, not the last-day
    value, so this is not the usual "gain relative to the starting price" --
    confirm this is intended.
    """
    absoluteChange = getDiffValue(lastDayValue, targetDayValue)
    denominator = float(targetDayValue)
    return absoluteChange / denominator

In [10]:
#offset from a sample's start index to the price used as its target (prices[idx+maxSeqLen])
maxSeqLen = 20
#entries skipped between the end of the feature window and the target; the feature
#window holds maxSeqLen-marginDays days. Also the number of trailing indexes
#returned by selectPredictionIndex for the prediction file
marginDays = 5
#threshold on getDiff: above it a sample is labelled 1, below its negative -1, otherwise 0
minGain = 0.05
#number of random start indexes requested from selectSampleIndex
numSamples = 200

In [11]:
#draw the random start indexes for the training samples; the list size is shortened
#by marginDays (previously a hard-coded 5, which matches marginDays -- presumably to
#keep the trailing entries used by the prediction file out of the training targets)
resIdx = selectSampleIndex(len(prices)-marginDays, numSamples, maxSeqLen)
resIdx[0:10], resIdx[-10:] #see first and last 10 index


Out[11]:
([0, 1, 4, 5, 6, 7, 8, 9, 10, 11],
 [255, 256, 257, 258, 259, 260, 261, 262, 264, 265])

In [12]:
#number of rows kept after dropping the non-positive prices
len(dates)


Out[12]:
292

In [13]:
#build train file
#one row per sampled start index: date of the last feature day, the prices and the
#volumes of the feature window, the target-day price, the absolute change, the
#relative change (getDiff) and the resulting label

featureLen = maxSeqLen - marginDays  #number of days in each feature window

outfilename = 'eth_train_data.csv'

#header: date, day1..dayN, vol1..volN, then the target columns
header = ','.join(
    ['date']
    + ['day%d' % (num + 1) for num in range(featureLen)]
    + ['vol%d' % (num + 1) for num in range(featureLen)]
    + ['dayTarget', 'diffVal', 'diff', 'isBullish']
) + '\n'

#`with` closes the file even if a row fails part-way through
with open(outfilename, 'w') as outfile:
    outfile.write(header)

    for idx in resIdx:
        lastDayPrice = prices[idx + featureLen - 1]  #last day of the feature window
        targetPrice = prices[idx + maxSeqLen]        #price of the day target

        diff = getDiff(lastDayPrice, targetPrice)
        if diff > minGain:
            isBullish = 1
        elif diff < (-1*minGain):
            isBullish = -1
        else:
            isBullish = 0

        numericFields = (
            prices[idx:idx + featureLen]      #prices of sequence of days
            + volumes[idx:idx + featureLen]   #volumes of sequence of days
            + [targetPrice, getDiffValue(lastDayPrice, targetPrice), diff, isBullish]
        )

        #numbers keep their repr() form (what the backtick syntax produced); the date
        #is written as plain text, because repr() of a string adds literal quote
        #characters to the CSV field
        fields = [str(dates[idx + featureLen - 1])] + [repr(x) for x in numericFields]
        outfile.write(','.join(fields) + '\n')

In [11]:
#build prediction file
#one row per prediction index: date of that day, then the prices and the volumes of
#the feature window that ends on that day

featureLen = maxSeqLen - marginDays  #number of days in each feature window

outfilename = 'eth_pred_data.csv'

#header: date, day1..dayN, vol1..volN
header = ','.join(
    ['date']
    + ['day%d' % (num + 1) for num in range(featureLen)]
    + ['vol%d' % (num + 1) for num in range(featureLen)]
) + '\n'

idxPred = selectPredictionIndex(len(prices), marginDays)

#`with` closes the file even if a row fails part-way through
with open(outfilename, 'w') as outfile:
    outfile.write(header)

    for idx in idxPred:
        windowStart = idx - featureLen + 1
        if windowStart < 0:
            #not enough history: a negative slice start would wrap around and
            #yield a short row that no longer lines up with the header
            continue

        numericFields = (
            prices[windowStart:idx + 1]      #prices of sequence of days
            + volumes[windowStart:idx + 1]   #volumes of sequence of days
        )

        #numbers keep their repr() form (what the backtick syntax produced); the date
        #is written as plain text, because repr() of a string adds literal quote
        #characters to the CSV field
        fields = [str(dates[idx])] + [repr(x) for x in numericFields]
        outfile.write(','.join(fields) + '\n')

In [15]:
#show the indexes selected for the prediction file
idxPred


Out[15]:
[284, 285, 286, 287, 288]

In [16]:
#price at the last prediction index; -1 replaces the hard-coded position 4, which was
#only the last entry (or valid at all) when marginDays is 5
prices[idxPred[-1]]


Out[16]:
0.03168962

In [17]:
#date at the last prediction index; -1 replaces the hard-coded position 4, which was
#only the last entry (or valid at all) when marginDays is 5
dates[idxPred[-1]]


Out[17]:
'2016-05-22'

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: