In [ ]:
#installing pandas libraries
!pip install pandas-datareader
!pip install --upgrade html5lib==1.0b8
#There is a bug in the latest version of html5lib so install an earlier version
#Restart kernel after installing html5lib
In [2]:
import pandas as pd #pandas library
from pandas_datareader import data #data readers (google, html, etc.)
#The following line ensures that graphs are rendered in the notebook
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt #Plotting library
import datetime as dt #datetime for timeseries support
In [3]:
# Build a frame from a list of rows; `columns` supplies the column labels.
# No index is given, so pandas assigns the default integer row labels.
pd.DataFrame([[1,2,3],[1,2,3]],columns=['A','B','C'])
Out[3]:
In [40]:
# Small example frame: the first column carries the row labels, the rest
# hold string values that encode their own (row, column) position.
df = pd.DataFrame(
    [
        ['r1', '00', '01', '02'],
        ['r2', '10', '11', '12'],
        ['r3', '20', '21', '22'],
    ],
    columns=['row_label', 'A', 'B', 'C'],
)
# The id is printed before and after so the reader can confirm that
# set_index(..., inplace=True) modifies the existing object instead of
# binding df to a new one.
print(id(df))
df.set_index('row_label', inplace=True)
print(id(df))
df
Out[40]:
In [39]:
# Stand-alone example frame used to try out filtering, grouping, sorting and
# value replacement.
# It uses its own names (people_data / people_df) on purpose: binding `data`
# here would shadow the `data` module imported from pandas_datareader, and
# binding `df` would clobber the row-labelled frame that the indexing cells
# below rely on when the notebook is run top to bottom.
people_data = {
    'nationality': ['UK', 'China', 'US', 'UK', 'Japan', 'China', 'UK', 'UK', 'Japan', 'US'],
    'age': [25, 30, 15, np.nan, 25, 22, np.nan, 45, 18, 33],
    'type': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'diabetes': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
people_df = pd.DataFrame(data=people_data, index=labels)
# Other things to try on this frame:
#print(people_df[people_df['age'].between(20,30)])
#print(people_df.groupby('nationality')['age'].mean())
#print(people_df.sort_values(by=['age','type'],ascending=[False,True]))
# Replace the abbreviation with the full country name (exact-match replace).
people_df['nationality'] = people_df['nationality'].replace('US', 'United States')
print(people_df)
In [41]:
# Select the second row by integer position.
# The original used df.ix[1]; .ix has been removed from pandas. On a frame
# whose index holds string labels (assumed here: 'r1', 'r2', 'r3'), .ix with
# an integer fell back to positional lookup, which is what .iloc does
# explicitly.
df.iloc[1]
Out[41]:
In [42]:
# Plain [] with a single label selects one column.
df['B']
Out[42]:
In [43]:
# .loc with a single label selects one row by its index label.
df.loc['r1']
Out[43]:
In [44]:
# .iloc selects by integer position; 0 is the first row.
df.iloc[0]
Out[44]:
In [45]:
# Select several columns at once; they come back in the order requested.
df[['B','A']] #Note that the column labels are passed as a list (hence the double brackets)
Out[45]:
In [46]:
# A single .loc call with (row label, column label) selects one cell.
df.loc['r2','B']
Out[46]:
In [47]:
# Chained form: .loc['r2'] first returns the row, then ['A'] picks a value
# from that row. Fine for reading; for assignment use df.loc['r2','A'].
df.loc['r2']['A']
Out[47]:
In [49]:
# Show the whole frame, then a label-based row slice.
# Note that a .loc slice includes BOTH end labels ('r1' and 'r2').
print(df, df.loc['r1':'r2'], sep='\n')
In [50]:
# Label slices on both axes: rows 'r1' through 'r2', columns 'B' through 'C'.
# Both end labels are included on each axis.
df.loc['r1':'r2','B':'C']
Out[50]:
In [60]:
# Scrape every HTML table on the page into a list of frames.
# Source used previously, kept for reference:
#df_list = pd.read_html('http://www.bloomberg.com/markets/currencies/major')
df_list = pd.read_html('http://www.waihuipaijia.cn/', encoding='utf-8')
# Number of tables that were found on the page.
print(len(df_list))
In [61]:
# Work with the first table found on the page.
df = df_list[0]
print(df)
In [ ]:
# Use the 'Currency' column as the row index so rows can be looked up by
# currency pair.
# NOTE(review): the 'Currency' column and the 'EUR-USD' style labels used in
# the following cells look like they belong to the commented-out Bloomberg
# table - confirm that the currently scraped page provides them.
df.set_index('Currency',inplace=True)
print(df)
In [ ]:
# Look up one cell: row 'EUR-CHF', column 'Value'.
df.loc['EUR-CHF','Value']
In [ ]:
# Demonstrates why chained indexing must not be used for assignment.
eur_usd = df.loc['EUR-USD']['Change'] #This is chained indexing: .loc[...] returns a row, then ['Change'] indexes that row
df.loc['EUR-USD']['Change'] = 1.0 #The assignment targets the intermediate row object, which may be a copy rather than the dataframe itself
print(eur_usd)
print(df.loc['EUR-USD']['Change']) #If the row was a copy, neither eur_usd nor the dataframe has changed
In [ ]:
# The correct way to assign: a single .loc call with (row, column).
eur_usd = df.loc['EUR-USD','Change'] #A single .loc[row, column] lookup returns the scalar value; eur_usd does not stay linked to the dataframe
df.loc['EUR-USD','Change'] = 1.0 #A single .loc call assigns directly into the dataframe
print(eur_usd) #Still the old value: it was read before the assignment and a scalar does not track later changes
print(df.loc['EUR-USD']['Change']) #The dataframe has been correctly updated
In [62]:
# Download daily IBM prices from the start of 2017 until today.
# `data` is re-imported here so the cell works even if the name was rebound
# earlier in the session.
from pandas_datareader import data
import datetime as dt
start=dt.datetime(2017, 1, 1)
end=dt.datetime.today()
print(start,end)
# The 'google' source used originally has been removed from pandas-datareader
# and raises an error, so Stooq is used instead.
# Stooq returns rows newest-first; sort_index() restores chronological order,
# which pct_change() and rolling() in the following cells depend on.
df = data.DataReader('IBM', 'stooq', start, end).sort_index()
In [63]:
# Display the downloaded price frame.
df
Out[63]:
In [64]:
# Add an indicator column: 1 on days that closed above the open, 0 otherwise.
df['UP'] = np.where(df['Close'].gt(df['Open']), 1, 0)
df
Out[64]:
In [65]:
# Summary statistics for every numeric column.
df.describe()
Out[65]:
In [66]:
# Fraction of days that closed above the open: the mean of a 0/1 column is
# the share of ones.
df['UP'].mean()
Out[66]:
In [67]:
# Fractional change of Close from one row to the next; the first entry has
# no predecessor and is therefore NaN.
df['Close'].pct_change() #One timeperiod percent change
Out[67]:
In [71]:
# Same calculation over a longer horizon: each row is compared with the row
# n periods earlier.
n=2
df['Close'].pct_change(n) #n timeperiods percent change
Out[71]:
In [72]:
# Average 13-period change over the whole sample.
n=13
df['Close'].pct_change(n).mean()
Out[72]:
In [73]:
# rolling(21) on its own only describes a 21-row moving window; it yields a
# Rolling object, and an aggregation such as .mean() is needed to get values.
df['Close'].pct_change(n).rolling(21)
Out[73]:
In [74]:
# 21-row moving average of the 13-period change.
n=13
df['Close'].pct_change(n).rolling(21).mean()
Out[74]:
In [75]:
# Moving averages of the n-period change for several window lengths.
# `n` is taken from the preceding cell.
# The n-period change is identical for every window, so it is computed once
# and reused instead of being recalculated for each moving average.
returns_n = df['Close'].pct_change(n)
ma_8 = returns_n.rolling(window=8).mean()
ma_13 = returns_n.rolling(window=13).mean()
ma_21 = returns_n.rolling(window=21).mean()
ma_34 = returns_n.rolling(window=34).mean()
ma_55 = returns_n.rolling(window=55).mean()
In [76]:
# Compare a short and a long moving average on the same axes.
# Each line gets a label and the axes a legend, otherwise the two curves
# cannot be told apart in the rendered figure.
ax = ma_8.plot(label='8-period moving average')
ma_34.plot(ax=ax, label='34-period moving average')
ax.set_title('Moving averages of the n-period change in Close')
ax.set_ylabel('Fractional change')
ax.legend();
Out[76]:
In [77]:
# Download closing prices for four solar-sector tickers over a fixed window.
import datetime
import pandas_datareader as data
start = datetime.datetime(2015,7,1)
end = datetime.datetime(2016,6,1)
# The 'google' source used originally has been removed from pandas-datareader
# and raises an error, so Stooq is used instead.
# With a list of symbols the result has one column per symbol under each
# price field; ['Close'] keeps the closing prices only.
# Stooq returns rows newest-first, so sort_index() restores chronological
# order for the pct_change() call in a later cell.
# NOTE(review): some of these tickers may no longer be listed; a symbol the
# source cannot serve is expected to come back as NaN - confirm the result.
solar_df = data.DataReader(['FSLR', 'TAN','RGSE','SCTY'],'stooq', start=start,end=end)['Close'].sort_index()
In [78]:
# Display the closing prices, one column per ticker.
solar_df
Out[78]:
In [79]:
# Row-to-row fractional change of each ticker's closing price.
rets = solar_df.pct_change()
print(rets)
In [80]:
# Scatter of FSLR returns (x axis) against TAN returns (y axis).
import matplotlib.pyplot as plt
plt.scatter(rets.FSLR,rets.TAN)
Out[80]:
In [81]:
# Scatter of RGSE returns (x axis) against TAN returns (y axis).
plt.scatter(rets.RGSE,rets.TAN)
Out[81]:
In [82]:
# Scatter of SCTY returns (x axis) against TAN returns (y axis).
plt.scatter(rets.SCTY,rets.TAN)
Out[82]:
In [83]:
# Pairwise correlation matrix of the tickers' returns.
solar_corr = rets.corr()
print(solar_corr)
In [84]:
# Risk/return picture: one point per ticker, mean return on the x axis and
# standard deviation of returns on the y axis.
# Mean and standard deviation are computed once and reused for both the
# scatter and the annotations.
mean_rets = rets.mean()
std_rets = rets.std()
plt.scatter(mean_rets, std_rets)
plt.xlabel('Expected returns')
plt.ylabel('Standard deviations')
# Label every point with its ticker. The annotate call is the body of the
# loop and must be indented; plt.show() runs once, after all labels are drawn.
for label, x, y in zip(rets.columns, mean_rets, std_rets):
    plt.annotate(
        label,
        xy = (x, y), xytext = (20, -20),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
plt.show()
In [85]:
# Ordinary least squares: explain TAN's closing price with the closing prices
# of the three individual solar stocks.
import numpy as np
import statsmodels.api as sm

# Response variable.
y = solar_df['TAN']
# Explanatory variables, plus a constant column so the model has an intercept.
X = sm.add_constant(solar_df[['FSLR', 'RGSE', 'SCTY']])
# missing='drop' discards rows that contain NaN before fitting.
model = sm.OLS(y, X, missing='drop')
result = model.fit()
print(result.summary())
In [86]:
# Overlay the observed TAN series and the values fitted by the regression.
# Labels and a legend are set so the two lines can be told apart.
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(y, label='TAN (actual)')
ax.plot(result.fittedvalues, label='TAN (OLS fitted)')
ax.set_xlabel('Date')
ax.set_ylabel('Close')
ax.set_title('TAN: actual vs. fitted')
ax.legend();
Out[86]:
In [ ]: