In [102]:
import pandas as pd
import pandas.io.data as web
import numpy as np
import numpy.random as npr
import statsmodels.api as sm
import scipy.stats as scs
import matplotlib.pyplot as plt
%matplotlib inline
`print_statistics` is a wrapper function around the `describe` function from
the `scipy.stats` sublibrary.
In [104]:
def print_statistics(array):
    '''Prints selected statistics.

    Thin wrapper around ``scipy.stats.describe`` that writes a two-column
    table (statistic name, value) to standard output.

    Parameters
    ----------
    array: ndarray
        object to generate statistics on

    Returns
    -------
    None
    '''
    # describe() returns, in order: number of observations, (min, max),
    # mean, variance, skewness, kurtosis. Positional indexing is kept so the
    # function also works with SciPy releases that return a plain tuple.
    sta = scs.describe(array)

    # Every print call receives exactly one pre-formatted string wrapped in
    # parentheses: that form is valid, and produces identical output, both as
    # a Python 2 print statement and as a Python 3 print() call.
    print(' %14s %15s ' % (' statistics ', ' value'))
    print(30 * '-')
    print(' %14s %15f ' % ('size ', sta[0]))
    print(' %14s %15f ' % ('min ', sta[1][0]))
    print(' %14s %15f ' % (' max ', sta[1][1]))
    print(' %14s %15f ' % (' mean ', sta[2]))
    # describe() reports the variance; its square root is the standard deviation.
    print(' %14s %15f ' % (' std', np.sqrt(sta[3])))
    print(' %14s %15f ' % (' skew ', sta[4]))
    print(' %14s %15f ' % (' kurtosis', sta[5]))
The function `normality_tests` combines three different statistical tests:
- Skewness test (`skewtest`): tests whether the skew of the sample data is “normal” (i.e., has a value close enough to zero).
- Kurtosis test (`kurtosistest`): similarly, tests whether the kurtosis of the sample data is “normal” (again, close enough to zero).
- Normality test (`normaltest`): combines the other two test approaches to test for normality.
In [105]:
def normality_tests(arr):
    '''Tests for normality distribution of given data set.

    Prints the sample skewness and excess kurtosis together with the
    p-values of SciPy's skewness, kurtosis and combined normality tests.

    Parameters
    ----------
    arr: ndarray
        object to generate statistics on

    Returns
    -------
    None
    '''
    # Each SciPy test returns (statistic, p-value); index 1 picks the p-value.
    # The parenthesised single-argument print form behaves identically under
    # Python 2 (print statement) and Python 3 (print function).
    print(' Skew of data set %14.3f' % scs.skew(arr))
    print(' Skew test p-value %14.3f' % scs.skewtest(arr)[1])
    print(' kurt of data set %14.3f' % scs.kurtosis(arr))
    print(' kurt test p-value %14.3f' % scs.kurtosistest(arr)[1])
    print(' Norm test p-values %14.3f' % scs.normaltest(arr)[1])
Symbols used: `^GDAXI` is the German DAX index, `^GSPC` is the American S&P 500 index,
`YHOO` is Yahoo, and `MSFT` is Microsoft.
In [106]:
# Tickers analysed in this notebook: two equity indices, then two single stocks.
symbols = [
    '^GDAXI',  # German DAX index
    '^GSPC',   # American S&P 500 index
    'YHOO',    # Yahoo
    'MSFT',    # Microsoft
]
In [107]:
# Echo the ticker list so the cell output shows which symbols are analysed.
symbols
Out[107]:
In [108]:
# Build one price table with a column per symbol: for each ticker, request its
# history from the 'yahoo' data source starting 1/1/2006 and keep only the
# 'Adj Close' column.
# NOTE(review): `web` is pandas.io.data, which newer pandas releases no longer
# ship -- confirm the installed pandas version still provides it.
data = pd.DataFrame()
for sym in symbols:
    quotes = web.DataReader(sym, data_source='yahoo', start='1/1/2006')
    data[sym] = quotes['Adj Close']

# Keep only the rows in which every symbol has a value.
data = data.dropna()
In [109]:
# Print a summary of the price table (index range, columns, non-null counts, dtypes).
data.info()
Normalize every series so that it starts at 100, which makes the indices and stocks comparable on one chart.
In [110]:
# Rebase every column to 100 at the first row so that all series share a
# common starting point, then plot them on one chart.
# .iloc[0] selects the first row by position. It replaces the .ix indexer,
# which was deprecated in pandas 0.20 and removed in pandas 1.0.
# NOTE(review): assumes the index of `data` is not integer-labelled (so the
# original .ix[0] was a positional lookup) -- confirm against the loader.
(data / data.iloc[0] * 100).plot(figsize=(8, 6))
Out[110]:
In [111]:
# Log returns between consecutive rows: log(value_t / value_{t-1}).
# shift() moves every column down by one row, so the first row of the result
# is NaN (it has no predecessor).
log_returns = np.log(data.div(data.shift(periods=1)))
In [112]:
# Preview the first rows of the log-return table.
log_returns.head()
Out[112]:
In [113]:
# Draw a 50-bin histogram for each column of log_returns.
log_returns.hist(bins=50,figsize=(9,6))
Out[113]:
In [114]:
# Print the descriptive-statistics table for the log returns of every symbol.
# Parenthesised single-argument prints run unchanged on Python 2 and Python 3.
for sym in symbols:
    print(' \n Results for symbols %s' % sym)
    print(30 * '-')
    # Remove NaN entries before the statistics are computed, and hand
    # print_statistics a plain ndarray.
    log_data = np.array(log_returns[sym].dropna())
    print_statistics(log_data)
In [115]:
# Q-Q plot of the ^GSPC log returns (NaN rows removed); line='s' adds the
# standardized reference line.
gspc_returns = log_returns['^GSPC'].dropna()
sm.qqplot(gspc_returns, line='s')
plt.xlabel('theoretical quantiles')
plt.grid(True)
plt.ylabel('sample quantiles')
Out[115]:
In [116]:
# Q-Q plot of the MSFT log returns (NaN rows removed); line='s' adds the
# standardized reference line.
msft_returns = log_returns['MSFT'].dropna()
sm.qqplot(msft_returns, line='s')
plt.xlabel('theoretical quantiles')
plt.grid(True)
plt.ylabel('sample quantiles')
Out[116]:
In [117]:
# Q-Q plot of the ^GDAXI log returns (NaN rows removed); line='s' adds the
# standardized reference line.
dax_returns = log_returns['^GDAXI'].dropna()
sm.qqplot(dax_returns, line='s')
plt.xlabel('theoretical quantiles')
plt.grid(True)
plt.ylabel('sample quantiles')
Out[117]:
In [118]:
# Q-Q plot of the YHOO log returns (NaN rows removed); line='s' adds the
# standardized reference line.
yhoo_returns = log_returns['YHOO'].dropna()
sm.qqplot(yhoo_returns, line='s')
plt.xlabel('theoretical quantiles')
plt.grid(True)
plt.ylabel('sample quantiles')
Out[118]:
In [119]:
# Run the normality tests on the log returns of every symbol.
# Parenthesised single-argument prints run unchanged on Python 2 and Python 3.
for sym in symbols:
    print(' \nResults for symbol %s' % sym)
    print(32 * '-')
    # Remove NaN entries before testing, and hand normality_tests a plain ndarray.
    log_data = np.array(log_returns[sym].dropna())
    normality_tests(log_data)