Stock Analysis

Stock Market Analysis

Welcome to your second data project! In this portfolio project we will be looking at data from the stock market, particularly some technology stocks. We will learn how to use pandas to get stock information, visualize different aspects of it, and finally we will look at a few ways of analyzing the risk of a stock, based on its previous performance history. We will also be predicting future stock prices through a Monte Carlo method!

We’ll be answering the following questions along the way:

What was the change in price of the stock over time?
What was the daily return of the stock on average?
What was the moving average of the various stocks?
What was the correlation between different stocks’ closing prices?
What was the correlation between different stocks’ daily returns?
How much value do we put at risk by investing in a particular stock?
How can we attempt to predict future stock behavior?

import pandas as pd
import numpy as np
from pandas import DataFrame, Series

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline

import pandas_datareader as pdr
from pandas_datareader import data, wb

from datetime import datetime

# Tickers of the technology stocks analysed throughout this notebook.
tech_list = ['AAPL', 'FB', 'AMZN', 'GOOG']

# Analysis window: the trailing year, ending now.
end = datetime.now()
try:
    start = datetime(end.year - 1, end.month, end.day)
except ValueError:
    # Only reachable on Feb 29: the previous year has no such day, so fall
    # back to Feb 28 instead of crashing.
    start = datetime(end.year - 1, end.month, end.day - 1)

# Quick example with a single ticker: download FB's history and turn the
# index into a regular column (shown as 'Date' in the preview).
temp = pdr.get_data_yahoo('FB').reset_index(level=0)

temp.head()

	Date	Open	High	Low	Close	Adj Close	Volume
0	2012-05-18	42.049999	45.000000	38.000000	38.230000	38.230000	573576400
1	2012-05-21	36.529999	36.660000	33.000000	34.029999	34.029999	168192700
2	2012-05-22	32.610001	33.590000	30.940001	31.000000	31.000000	101786600
3	2012-05-23	31.370001	32.500000	31.360001	32.000000	32.000000	73600000
4	2012-05-24	32.950001	33.209999	31.770000	33.029999	33.029999	50237200

# Fetch every ticker, keep only the rows inside the analysis window, and bind
# each frame to a module-level name matching its ticker (AAPL, FB, AMZN, GOOG),
# because the cells below refer to the frames by those names.
for stock in tech_list:
    temp = pdr.get_data_yahoo(stock)
    temp.reset_index(level=0, inplace=True)  # expose the index as a 'Date' column

    # drop=True renumbers the filtered rows from 0 without leaving helper
    # columns ('index' / 'level_0') behind, and it returns a new frame rather
    # than a slice of `temp`, so later column assignments are safe.
    globals()[stock] = temp[temp['Date'] >= start].reset_index(drop=True)

GOOG.head()

	level_0	Date	Open	High	Low	Close	Adj Close	Volume
0	0	2017-01-09	806.400024	809.966003	802.830017	806.650024	806.650024	1272400
1	1	2017-01-10	807.859985	809.130005	803.510010	804.789978	804.789978	1176800
2	2	2017-01-11	805.000000	808.150024	801.369995	807.909973	807.909973	1065900
3	3	2017-01-12	807.140015	807.390015	799.169983	806.359985	806.359985	1353100
4	4	2017-01-13	807.479980	811.223999	806.690002	807.880005	807.880005	1099200

# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
GOOG.describe()

	level_0	Open	High	Low	Close	Adj Close	Volume
count	253.000000	253.000000	253.000000	253.000000	253.000000	253.000000	2.530000e+02
mean	126.000000	927.111421	932.969089	921.341079	927.840138	927.840138	1.467822e+06
std	73.179004	79.020587	80.376678	78.353580	79.435737	79.435737	6.384932e+05
min	0.000000	793.799988	801.190002	790.520020	795.695007	795.695007	5.370000e+05
25%	63.000000	842.880005	844.909973	839.320007	841.650024	841.650024	1.086500e+06
50%	126.000000	931.469971	936.530029	924.590027	930.599976	930.599976	1.279500e+06
75%	189.000000	980.000000	985.424988	972.200012	977.000000	977.000000	1.620500e+06
max	252.000000	1109.400024	1111.270020	1101.619995	1106.939941	1106.939941	5.167700e+06

# Column dtypes, non-null counts and memory footprint of the GOOG frame.
GOOG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 8 columns):
level_0      253 non-null int64
Date         253 non-null datetime64[ns]
Open         253 non-null float64
High         253 non-null float64
Low          253 non-null float64
Close        253 non-null float64
Adj Close    253 non-null float64
Volume       253 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 15.9 KB

# Adjusted closing price of GOOG; the x axis is the frame's row index.
GOOG['Adj Close'].plot(legend=True,figsize = (10,4))

<matplotlib.axes._subplots.AxesSubplot at 0x1141b5908>

png

# Traded volume of GOOG against the trading date.
GOOG.plot(y="Volume",x="Date",legend=True,figsize = (10,4))

<matplotlib.axes._subplots.AxesSubplot at 0x1118cfb38>

png

Info on moving averages

http://www.investopedia.com/terms/m/movingaverage.asp

http://www.investopedia.com/articles/active-trading/052014/how-use-moving-average-buy-stocks.asp

# Window lengths (in rows, i.e. trading days) for the moving averages.
ma_day = [10,20,50]

for ma in ma_day:
    colname = "MA for {0} days".format(ma)
    
    # Rolling mean of AAPL's adjusted close, stored as one new column per
    # window. The first (ma - 1) rows stay NaN until the window is full.
    AAPL[colname] = AAPL['Adj Close'].rolling(window=ma).mean()

AAPL.head(10)

	level_0	Date	Open	High	Low	Close	Adj Close	Volume	MA for 10 days	MA for 20 days	MA for 50 days
0	0	2017-01-09	117.949997	119.430000	117.940002	118.989998	117.106812	33561900	NaN	NaN	NaN
1	1	2017-01-10	118.769997	119.379997	118.300003	119.110001	117.224907	24462100	NaN	NaN	NaN
2	2	2017-01-11	118.739998	119.930000	118.599998	119.750000	117.854782	27588600	NaN	NaN	NaN
3	3	2017-01-12	118.900002	119.300003	118.209999	119.250000	117.362694	27086200	NaN	NaN	NaN
4	4	2017-01-13	119.110001	119.620003	118.809998	119.040001	117.156021	26111900	NaN	NaN	NaN
5	5	2017-01-17	118.339996	120.239998	118.220001	120.000000	118.100822	34439800	NaN	NaN	NaN
6	6	2017-01-18	120.000000	120.500000	119.709999	119.989998	118.090981	23713000	NaN	NaN	NaN
7	7	2017-01-19	119.400002	120.089996	119.370003	119.779999	117.884300	25597300	NaN	NaN	NaN
8	8	2017-01-20	120.449997	120.449997	119.730003	120.000000	118.100822	32597900	NaN	NaN	NaN
9	9	2017-01-23	120.000000	120.809998	119.769997	120.080002	118.179558	22050200	117.70617	NaN	NaN

# Overlay the adjusted close and its three moving averages on a single axes.
AAPL[["Adj Close", "MA for 10 days","MA for 20 days","MA for 50 days"]].plot(subplots=False, figsize = (10,4))
plt.show()

png

# Daily return = percentage change of the adjusted close from the previous
# row; the first row has no predecessor and is therefore NaN.
AAPL['Daily Returns'] = AAPL['Adj Close'].pct_change()

# Line plot of the daily returns, one marker per trading day.
AAPL['Daily Returns'].plot(figsize=(14,6), legend=True, linestyle='--', marker='o')
plt.show()

png

# Distribution of AAPL's daily returns; NaN rows are dropped before plotting.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it) -- confirm the installed version before upgrading seaborn.
sns.distplot(AAPL['Daily Returns'].dropna(), bins=100, color='purple')
plt.show()

png

# Same distribution drawn with pandas' built-in histogram.
AAPL['Daily Returns'].hist(bins=100)

<matplotlib.axes._subplots.AxesSubplot at 0x1188e0f60>

png

# Preview AAPL with the moving-average and daily-return columns added above.
AAPL.head()

	level_0	Date	Open	High	Low	Close	Adj Close	Volume	MA for 10 days	MA for 20 days	MA for 50 days	Daily Returns
0	0	2017-01-09	117.949997	119.430000	117.940002	118.989998	117.106812	33561900	NaN	NaN	NaN	NaN
1	1	2017-01-10	118.769997	119.379997	118.300003	119.110001	117.224907	24462100	NaN	NaN	NaN	0.001008
2	2	2017-01-11	118.739998	119.930000	118.599998	119.750000	117.854782	27588600	NaN	NaN	NaN	0.005373
3	3	2017-01-12	118.900002	119.300003	118.209999	119.250000	117.362694	27086200	NaN	NaN	NaN	-0.004175
4	4	2017-01-13	119.110001	119.620003	118.809998	119.040001	117.156021	26111900	NaN	NaN	NaN	-0.001761

# First adjusted closes of AAPL.
AAPL['Adj Close'].head()

  117.106812
  117.224907
  117.854782
  117.362694
  117.156021
Name: Adj Close, dtype: float64

# First adjusted closes of GOOG.
GOOG['Adj Close'].head()

  806.650024
  804.789978
  807.909973
  806.359985
  807.880005
Name: Adj Close, dtype: float64

# First adjusted closes of FB.
FB['Adj Close'].head()

  124.900002
  124.349998
  126.089996
  126.620003
  128.339996
Name: Adj Close, dtype: float64

# Adjusted closes side by side, one column per ticker. The columns are
# aligned on the frames' integer row index, not on 'Date'.
closing_df = DataFrame({
    "AAPL": AAPL['Adj Close'],
    "GOOG": GOOG['Adj Close'],
    "FB": FB['Adj Close'],
    "AMZN": AMZN['Adj Close']
    
})
closing_df.head()

	AAPL	AMZN	FB	GOOG
0	117.106812	796.919983	124.900002	806.650024
1	117.224907	795.900024	124.349998	804.789978
2	117.854782	799.020020	126.089996	807.909973
3	117.362694	813.640015	126.620003	806.359985
4	117.156021	817.140015	128.339996	807.880005

# Day-over-day percentage change of every ticker's adjusted close. The first
# row has no previous day to compare with, so it is removed.
big4_returns = closing_df.pct_change()
big4_returns = big4_returns.drop(big4_returns.index[0])

big4_returns.head()

	AAPL	AMZN	FB	GOOG
1	0.001008	-0.001280	-0.004404	-0.002306
2	0.005373	0.003920	0.013993	0.003877
3	-0.004175	0.018297	0.004203	-0.001919
4	-0.001761	0.004302	0.013584	0.001885
5	0.008064	-0.009081	-0.003662	-0.004048

# Joint distribution of daily returns for every ordered pair of distinct
# tickers.
for x in tech_list:
    for y in tech_list:
        if x != y:
            # x, y and data are passed by keyword: seaborn >= 0.12 no longer
            # accepts them positionally, and keywords also work on older
            # releases.
            sns.jointplot(x=x, y=y, data=big4_returns)
            plt.title(('{0} and {1}').format(x,y)) # Various CI
            plt.show()

png

# Reference figure: example scatter plots for different correlation values,
# loaded from the given Wikimedia URL (requires network access).
from IPython.display import SVG
SVG(url='http://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg')

svg

# Reminder of what the daily-returns frame looks like.
big4_returns.head()

	AAPL	AMZN	FB	GOOG
1	0.001008	-0.001280	-0.004404	-0.002306
2	0.005373	0.003920	0.013993	0.003877
3	-0.004175	0.018297	0.004203	-0.001919
4	-0.001761	0.004302	0.013584	0.001885
5	0.008064	-0.009081	-0.003662	-0.004048

# Pairwise plots of the daily returns of all four tickers (rows containing
# NaN are dropped first).
sns.pairplot(big4_returns.dropna())

<seaborn.axisgrid.PairGrid at 0x11b5e7978>

png

# Build the pair grid by hand so each region of the figure can use its own
# plot type: scatter above the diagonal, KDE below, histograms on it.

returns_fig = sns.PairGrid(big4_returns.dropna())

returns_fig.map_upper(plt.scatter,color='orange')
# The keyword used to be misspelled 'cmpa', so the requested colormap was
# never applied (and newer library versions may reject the unknown keyword).
returns_fig.map_lower(sns.kdeplot,cmap='cool_d')
returns_fig.map_diag(plt.hist,bins=30)
plt.show()

png

# Same pair grid as for the returns, this time on the adjusted closing prices:
# scatter above the diagonal, KDE below, histograms on it.

returns_fig = sns.PairGrid(closing_df.dropna())

returns_fig.map_upper(plt.scatter,color='orange')
# The keyword used to be misspelled 'cmpa', so the requested colormap was
# never applied (and newer library versions may reject the unknown keyword).
returns_fig.map_lower(sns.kdeplot,cmap='cool_d')
returns_fig.map_diag(plt.hist,bins=30)
plt.show()

png

# Pairwise correlation of the tickers' daily returns.
big4_returns.corr()

	AAPL	AMZN	FB	GOOG
AAPL	1.000000	0.511322	0.545902	0.491142
AMZN	0.511322	1.000000	0.653747	0.670783
FB	0.545902	0.653747	1.000000	0.711600
GOOG	0.491142	0.670783	0.711600	1.000000

# Compute the correlation matrix of the daily returns
corr = big4_returns.corr()

# Generate a mask for the upper triangle (diagonal included).
# np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio. The colormap built
# above is now actually passed in (it used to be computed and then ignored),
# and the target axes is given explicitly.
# NOTE(review): every off-diagonal value in the correlation table above
# exceeds vmax=.3, so the colour scale saturates -- consider raising vmax.
sns.heatmap(corr, mask=mask, cmap=cmap, annot=True, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x11a9b6588>

png

# Compute the correlation matrix of the adjusted closing prices
corr = closing_df.corr()

# Generate a mask for the upper triangle (diagonal included).
# np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio. The colormap built
# above is now actually passed in (it used to be computed and then ignored),
# and the target axes is given explicitly.
sns.heatmap(corr, mask=mask, cmap=cmap, annot=True, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x11b282400>

png

Risk Analysis

# Risk/return scatter: one point per ticker, mean daily return on the x axis
# and standard deviation of daily returns on the y axis.
rets = big4_returns.dropna()
area = np.pi * 20  # marker size passed to scatter
plt.scatter(x=rets.mean(), y=rets.std(), s=area)

plt.xlabel('Expected Return')
plt.ylabel('Risk')


# Label each point with its ticker. For more on annotations, see
# http://matplotlib.org/users/annotations_guide.html
for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
    plt.annotate(
        label, 
        xy = (x, y), xytext = (50, 50),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))

png

Value at Risk

We can treat value at risk as the amount of money we could expect to lose (i.e., put at risk) for a given confidence level. There are several methods we can use to estimate value at risk. Let’s go ahead and see some of them in action.

Value at risk using the “bootstrap” method

For this method we will calculate the empirical quantiles from a histogram of daily returns. For more information on quantiles, check out this link: http://en.wikipedia.org/wiki/Quantile

# Histogram of AAPL's daily returns. dropna() removes the NaN row first,
# because seaborn cannot plot NaN values.
sns.distplot(AAPL['Daily Returns'].dropna(),bins=100,color='purple')

<matplotlib.axes._subplots.AxesSubplot at 0x11aa40630>

png

# The 0.05 empirical quantile of AAPL's daily returns: 5% of the observed
# daily returns fall at or below this value.
rets['AAPL'].quantile(0.05)

-0.014911698036799348

**The 0.05 empirical quantile of daily returns is about -0.0149. That means that with 95% confidence, our worst daily loss will not exceed 1.49%. If we have a 1 million dollar investment, our one-day 5% VaR is 0.0149 * 1,000,000 = $14,900.**

from collections import defaultdict

# Empirical quantiles of every ticker's daily returns at several tail
# probabilities.
ranges = [0.025, 0.05, 0.1]
var_d = defaultdict(list)

for t_stock in tech_list:
    # One quantile per probability, appended in the order given by `ranges`.
    var_d[t_stock].extend(rets[t_stock].quantile(val) for val in ranges)
var_d

defaultdict(list,
            {'AAPL': [-0.020318352248894993,
              -0.014911698036799348,
              -0.009775791014228942],
             'AMZN': [-0.023780200527053923,
              -0.015361122650408596,
              -0.010369152275866923],
             'FB': [-0.02066069191606408,
              -0.01549650704753944,
              -0.008454008608159902],
             'GOOG': [-0.02331393054836877,
              -0.014096341117248605,
              -0.008840928304327932]})

# Tabulate the quantiles: one column per ticker, one row per entry of `ranges`
# (0.025, 0.05, 0.1 in that order).
df_VaR = DataFrame(var_d)
df_VaR

	AAPL	AMZN	FB	GOOG
0	-0.020318	-0.023780	-0.020661	-0.023314
1	-0.014912	-0.015361	-0.015497	-0.014096
2	-0.009776	-0.010369	-0.008454	-0.008841

Efficient Market Hypothesis

The three versions of the efficient market hypothesis are varying degrees of the same basic theory. The weak form suggests that today’s stock prices reflect all the data of past prices and that no form of technical analysis can be effectively utilized to aid investors in making trading decisions. Advocates for the weak form efficiency theory allow that if fundamental analysis is used, undervalued and overvalued stocks can be determined, and investors can research companies’ financial statements to increase their chances of making higher-than-market-average profits.
The semi-strong form efficiency theory follows the belief that because all information that is public is used in the calculation of a stock’s current price, investors cannot utilize either technical or fundamental analysis to gain higher returns in the market. Those who subscribe to this version of the theory believe that only information that is not readily available to the public can help investors boost their returns to a performance level above that of the general market. Inside trading.
The strong form version of the efficient market hypothesis states that all information – both the information available to the public and any information not publicly known – is completely accounted for in current stock prices, and there is no type of information that can give an investor an advantage on the market. Advocates for this degree of the theory suggest that investors cannot make returns on investments that exceed normal market returns, regardless of information retrieved or research conducted.
https://www.investopedia.com/ask/answers/032615/what-are-differences-between-weak-strong-and-semistrong-versions-efficient-market-hypothesis.asp#ixzz52OLUr7S1

Value at Risk using the Monte Carlo method

We will use the Monte Carlo method to run many trials with random market conditions and calculate the portfolio loss for each trial. After this, we’ll aggregate all of these simulations to establish how risky the stock is.

We will use the geometric Brownian motion (GBM), which is technically known as a Markov process. This means that the stock price follows a random walk and is consistent with (at the very least) the weak form of the efficient market hypothesis (EMH): past price information is already incorporated and the next price movement is “conditionally independent” of past price movements.

DRIFT: Expected Periodic Daily Rate of Return (the rate with the greatest odds of returning)

This means that the past information on the price of a stock is independent of where the stock price will be in the future, basically meaning, you can’t perfectly predict the future solely based on the previous price of a stock.

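The equation for geometric Brownian motion is:

$$\frac{\Delta S}{S} = \mu\,\Delta t + \sigma\,\epsilon\,\sqrt{\Delta t}$$

where $S$ is the stock price, $\mu$ is the expected return, $\sigma$ is the standard deviation of returns, $\Delta t$ is the time step, and $\epsilon$ is a standard normal random variable.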

We can multiply both sides by the stock price (S) to rearrange the formula and solve for the change in the stock price:

$$\Delta S = S\,\left(\mu\,\Delta t + \sigma\,\epsilon\,\sqrt{\Delta t}\right)$$

The first term is known as “drift”, which is the average daily return multiplied by the change of time.
The second term is known as “shock”, for each time period the stock will “drift” and then experience a “shock” which will randomly push the stock price up or down.

By simulating this series of steps of drift and shock thousands of times, we can begin to do a simulation of where we might expect the stock price to be.

For more info on the Monte Carlo method for stocks, check out the following link: http://www.investopedia.com/articles/07/montecarlo.asp

# Daily returns that feed the simulation parameters below.
big4_returns.head()

	AAPL	AMZN	FB	GOOG
1	0.001008	-0.001280	-0.004404	-0.002306
2	0.005373	0.003920	0.013993	0.003877
3	-0.004175	0.018297	0.004203	-0.001919
4	-0.001761	0.004302	0.013584	0.001885
5	0.008064	-0.009081	-0.003662	-0.004048

# Setting time horizon: number of simulated steps and the size of one step
# (dt is read as a module-level value by stock_monte_carlo).

days = 365
dt = 1/days

rets.head()

	AAPL	AMZN	FB	GOOG
1	0.001008	-0.001280	-0.004404	-0.002306
2	0.005373	0.003920	0.013993	0.003877
3	-0.004175	0.018297	0.004203	-0.001919
4	-0.001761	0.004302	0.013584	0.001885
5	0.008064	-0.009081	-0.003662	-0.004048

def stock_monte_carlo(start_price, days, mu, sigma):
    """Simulate one geometric-Brownian-motion price path.

    Every step applies ``price[t] = price[t-1] * (1 + drift + shock)`` with
    ``drift = mu * dt``, ``shock ~ Normal(0, sigma * sqrt(dt))`` and
    ``dt = 1 / days``.

    Args:
        start_price: Price at step 0.
        days: Number of points in the returned path (must be >= 1).
        mu: Expected return used for the drift term.
        sigma: Volatility used to scale the random shock.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``days`` whose first element is
        ``start_price``.

    Raises:
        ValueError: If ``days`` is smaller than 1.
    """
    if days < 1:
        raise ValueError("days must be at least 1")

    # The step is derived from the horizon that was passed in rather than
    # from a module-level ``dt``, so the two can never disagree.
    dt = 1.0 / days
    drift = mu * dt

    # The shock is zero-mean: the drift is added exactly once below. Centring
    # the shock on mu * dt as well would count the drift twice.
    shock = np.random.normal(loc=0.0, scale=sigma * np.sqrt(dt), size=days - 1)

    price = np.empty(days)
    price[0] = start_price
    # Compounding the per-step growth factors gives the whole path at once.
    price[1:] = start_price * np.cumprod(1.0 + drift + shock)
    return price

def get_metrics(tck, df):
    """Return the mean and standard deviation of one column of ``df``.

    Args:
        tck: Column label (ticker) to summarise.
        df: DataFrame holding one column of returns per ticker.

    Returns:
        A ``(mu, sigma, tck)`` tuple: the column's mean, its standard
        deviation, and the label that was passed in.

    Raises:
        KeyError: If ``tck`` is not a column of ``df``.
    """
    # Select the column first: only this ticker's statistics are computed,
    # and unrelated non-numeric columns in ``df`` cannot break the call.
    column = df[tck]
    return (column.mean(), column.std(), tck)
    

def run_simulations(start_price, mu, sigma, runs=10000, days=365):
    """Collect the final price of many independent simulated paths.

    Args:
        start_price: Price every simulated path starts from.
        mu: Expected return handed to ``stock_monte_carlo``.
        sigma: Volatility handed to ``stock_monte_carlo``.
        runs: Number of paths to simulate (default 10000).
        days: Length of each path (default 365).

    Returns:
        A 1-D ``numpy.ndarray`` of length ``runs`` holding the last price of
        each simulated path.
    """
    simulation = np.zeros(runs)

    for run in range(runs):
        # Index -1 is the last simulated day (the same element as days - 1),
        # i.e. the price at the end of the horizon.
        simulation[run] = stock_monte_carlo(start_price, days, mu, sigma)[-1]
    return simulation

def plot_histogram(*args):
    """Draw a histogram of simulated final prices with summary annotations.

    Positional arguments, in order: ``simulation`` (array of final prices),
    ``start_price`` and ``tck`` (ticker label used in the title).
    """
    simulation, start_price, tck = args

    plt.hist(simulation, 150)

    # 1st percentile of the simulated final prices.
    q = np.percentile(simulation, 1)

    # Text block on the right: start price, mean final price, and the gap
    # between the start price and the 1st percentile.
    right_column = (
        (0.8, "Start Price: {0:.2f}".format(start_price)),
        (0.7, "Mean final price: {0:.2f}".format(simulation.mean())),
        (0.6, "VaR(0.99): {0:.2f}".format(start_price - q)),
    )
    for height, text in right_column:
        plt.figtext(0.62, height, s=text)

    # The percentile itself, written on the left and marked by a red line.
    plt.figtext(0.13, 0.6, "q(0.99): {0:.2f}".format(q))
    plt.axvline(x=q, linewidth=4, color='r')

    plt.title("Final price distribution for {0} Stock after 365 days".format(tck))

def create_monte_chart(*args):
    """Plot 100 simulated 365-step price paths for one ticker.

    ``args[0]`` is the start price; ``args[1]`` is a ``(mu, sigma, tck)``
    tuple such as the one returned by ``get_metrics``.
    """
    start_price, metrics = args[0], args[1]
    mu, sigma, tck = metrics
    days = 365

    # Every call to stock_monte_carlo yields one independent path.
    for _ in range(100):
        path = stock_monte_carlo(start_price, days, mu, sigma)
        plt.plot(path)

    plt.xlabel("Days")
    plt.ylabel("Price")
    plt.title("Analysis for {0}".format(tck))
    plt.show()
    plt.close()
    

**The mean final price is close to the starting price because the expected return is close to 0. The VaR(0.99) means that in 99% of the simulated runs the loss did not exceed 2.63; we would expect to lose more than that only 1% of the time. This is NOT a big loss (the 1% quantile is indicated by the red line).**

# Plot 100 simulated 365-step paths for AAPL, starting from the first
# adjusted close in the analysis window.
start_price = AAPL['Adj Close'][0]
create_monte_chart(start_price, get_metrics('AAPL', rets))

png

# Plot 100 simulated 365-step paths for FB, starting from the first
# adjusted close in the analysis window.
start_price = FB['Adj Close'][0]
create_monte_chart(start_price, get_metrics('FB', rets))

png

# Plot 100 simulated 365-step paths for AMZN, starting from the first
# adjusted close in the analysis window.
start_price = AMZN['Adj Close'][0]
create_monte_chart(start_price, get_metrics('AMZN', rets))

png

# Plot 100 simulated 365-step paths for GOOG, starting from the first
# adjusted close in the analysis window.
start_price = GOOG['Adj Close'][0]
create_monte_chart(start_price, get_metrics('GOOG', rets))

png

# Daily returns used to estimate mu and sigma for the histograms below.
rets.head()

	AAPL	AMZN	FB	GOOG
1	0.001008	-0.001280	-0.004404	-0.002306
2	0.005373	0.003920	0.013993	0.003877
3	-0.004175	0.018297	0.004203	-0.001919
4	-0.001761	0.004302	0.013584	0.001885
5	0.008064	-0.009081	-0.003662	-0.004048

# Distribution of FB's simulated final price, starting from the first
# adjusted close in the analysis window.
start_price = FB['Adj Close'][0]

# mu / sigma are the mean and standard deviation of FB's daily returns.
mu, sigma, tck = get_metrics('FB', rets)

simulations = run_simulations(start_price, mu, sigma)
plot_histogram(simulations, start_price, tck)

png

# Distribution of AAPL's simulated final price, starting from the first
# adjusted close in the analysis window.
start_price = AAPL['Adj Close'][0]

# mu / sigma are the mean and standard deviation of AAPL's daily returns.
mu, sigma, tck = get_metrics('AAPL', rets)

simulations = run_simulations(start_price, mu, sigma)
plot_histogram(simulations, start_price, tck)

png

# Distribution of AMZN's simulated final price, starting from the first
# adjusted close in the analysis window.
start_price = AMZN['Adj Close'][0]

# mu / sigma are the mean and standard deviation of AMZN's daily returns.
mu, sigma, tck = get_metrics('AMZN', rets)

simulations = run_simulations(start_price, mu, sigma)
plot_histogram(simulations, start_price, tck)

png

# Distribution of GOOG's simulated final price, starting from the first
# adjusted close in the analysis window.
start_price = GOOG['Adj Close'][0]

# Use `rets` (the NaN-free returns) like the other three tickers do, so all
# four simulations are parameterised from the same data.
mu, sigma, tck = get_metrics('GOOG', rets)

simulations = run_simulations(start_price, mu, sigma)
plot_histogram(simulations, start_price, tck)