Machine Learning and Finance

CourseWork 2024 - StatArb

In this coursework, you will study and replicate selected elements of the research presented in the paper End-to-End Policy Learning of a Statistical Arbitrage Autoencoder Architecture; we will not reproduce the entire study.

Overview

This study redefines Statistical Arbitrage (StatArb) by combining Autoencoder architectures and policy learning to generate trading strategies. Traditionally, StatArb involves constructing a synthetic asset via classical or PCA-based methods and then trading on the assumption that its value reverts to a long-run mean. This paper instead proposes a data-driven approach: an Autoencoder trained on US stock returns is integrated into a neural network representing portfolio trading policies, which outputs portfolio allocations directly.

Coursework Goal

This coursework replicates selected results from the paper, providing hands-on experience in implementing and evaluating an end-to-end policy-learning Autoencoder for financial trading strategies.

Outline

  • Data Preparation and Exploration
  • Fama-French Analysis
  • PCA Analysis
  • Ornstein-Uhlenbeck Process
  • Autoencoder Analysis

Description: The coursework is graded on a 100-point scale and is divided into five parts (A to E). The mark distribution for each question is shown below:

Part     Question       Marks
Part A   Question 1     4
         Question 2     1
         Question 3     3
         Question 4     3
         Question 5     1
         Question 6     3
Part B   Question 7     1
         Question 8     5
         Question 9     4
         Question 10    5
         Question 11    2
         Question 12    3
Part C   Question 13    3
         Question 14    1
         Question 15    3
         Question 16    2
         Question 17    7
         Question 18    6
         Question 19    3
Part D   Question 20    3
         Question 21    5
         Question 22    2
Part E   Question 23    2
         Question 24    1
         Question 25    3
         Question 26    10
         Question 27    1
         Question 28    3
         Question 29    3
         Question 30    7
Total                   100

Please read the questions carefully and do your best. Good luck!

Objectives

1. Data Preparation and Exploration

Collect, clean, and prepare US stock return data for analysis.

2. Fama-French Analysis

Utilize Fama French Factors to isolate the idiosyncratic components of stock returns, differentiating them from market-wide effects. This analysis helps in understanding the unique characteristics of individual stocks relative to broader market trends.

3. PCA Analysis

Employ Principal Component Analysis (PCA) to identify hidden structures and reduce dimensionality in the data. This method helps in extracting significant patterns that might be obscured in high-dimensional datasets.
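
The mechanics can be sketched with a small NumPy example (a hypothetical `pca_factors` helper, not part of the coursework code): demean the return matrix, eigendecompose its sample covariance, and keep the leading components. This is equivalent to what sklearn.decomposition.PCA computes.

```python
import numpy as np

def pca_factors(returns, n_components=5):
    """Extract principal components from a (T x N) matrix of stock returns.

    Demean each column, eigendecompose the sample covariance, and keep the
    top n_components eigenvectors (the factor loadings).
    """
    X = returns - returns.mean(axis=0)          # demean each stock
    cov = np.cov(X, rowvar=False)               # N x N sample covariance
    eigvals, eigvecs = np.linalg.eigh(cov)      # eigh returns ascending order
    order = np.argsort(eigvals)[::-1]           # sort descending by variance
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
    loadings = eigvecs[:, :n_components]        # N x k loading matrix
    factors = X @ loadings                      # T x k factor return series
    explained = eigvals[:n_components] / eigvals.sum()
    return factors, loadings, explained

# Toy example: 252 days of returns for 10 stocks driven by one market factor
rng = np.random.default_rng(0)
market = rng.normal(0, 0.01, size=(252, 1))
returns = market + rng.normal(0, 0.005, size=(252, 10))
factors, loadings, explained = pca_factors(returns, n_components=3)
```

On this toy data the first component absorbs the common market variance, so `explained[0]` dominates the remaining components.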

4. Ornstein-Uhlenbeck Process

Analyze mean-reverting behavior in stock prices using the Ornstein-Uhlenbeck process. This stochastic process is useful for modeling and forecasting based on the assumption that prices will revert to a long-term mean.
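
As an illustration (not part of the coursework template), an OU process $dX_t = \kappa(\theta - X_t)\,dt + \sigma\,dW_t$ can be simulated with its exact discretisation, and $\kappa$ and $\theta$ recovered from an AR(1) fit of the simulated path:

```python
import numpy as np

def simulate_ou(kappa, theta, sigma, x0, n_steps, dt=1/252, seed=0):
    """Simulate an Ornstein-Uhlenbeck path using the exact discretisation:
    X_{t+dt} = theta + (X_t - theta) * exp(-kappa*dt) + Gaussian noise."""
    rng = np.random.default_rng(seed)
    x = np.empty(n_steps + 1)
    x[0] = x0
    decay = np.exp(-kappa * dt)
    noise_sd = sigma * np.sqrt((1 - decay**2) / (2 * kappa))
    for t in range(n_steps):
        x[t + 1] = theta + (x[t] - theta) * decay + noise_sd * rng.standard_normal()
    return x

def estimate_ou(x, dt=1/252):
    """Recover (kappa, theta) by regressing X_{t+1} on X_t (an AR(1) fit)."""
    b, a = np.polyfit(x[:-1], x[1:], 1)   # slope b, intercept a
    kappa = -np.log(b) / dt               # mean-reversion speed
    theta = a / (1 - b)                   # long-run mean
    return kappa, theta

path = simulate_ou(kappa=5.0, theta=0.0, sigma=0.2, x0=1.0, n_steps=5000)
kappa_hat, theta_hat = estimate_ou(path)
```

With 5,000 daily steps the estimates land near the true parameters, which is the same logic used later when fitting OU dynamics to residual portfolios.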

5. Building a Basic Autoencoder Model

Construct and train a standard Autoencoder to extract residual idiosyncratic risk.
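
Before reaching for Keras, the core idea can be sketched with a purely NumPy linear autoencoder (an illustrative toy, not the model the coursework builds): compress returns through a bottleneck, reconstruct them, and treat the reconstruction residual as a proxy for idiosyncratic risk.

```python
import numpy as np

def train_linear_autoencoder(X, k=3, lr=0.01, epochs=500, seed=0):
    """Minimal linear autoencoder trained by full-batch gradient descent.

    X : (T x N) matrix of (standardised) stock returns
    k : bottleneck width; the decoder output is the 'systematic' part of X
    Returns (W_enc, W_dec, residuals), residuals = X - X @ W_enc @ W_dec.
    """
    rng = np.random.default_rng(seed)
    T, N = X.shape
    W_enc = rng.normal(0, 0.1, (N, k))
    W_dec = rng.normal(0, 0.1, (k, N))
    for _ in range(epochs):
        Z = X @ W_enc              # (T x k) latent codes
        X_hat = Z @ W_dec          # (T x N) reconstruction
        E = X_hat - X              # reconstruction error
        W_dec -= lr * (Z.T @ E) / T
        W_enc -= lr * (X.T @ (E @ W_dec.T)) / T
    residuals = X - (X @ W_enc) @ W_dec
    return W_enc, W_dec, residuals

# Toy data: 20 stocks driven by 3 common factors plus small idiosyncratic noise
rng = np.random.default_rng(1)
common = rng.normal(size=(500, 3)) @ rng.normal(size=(3, 20))
X = common + 0.1 * rng.normal(size=(500, 20))
_, _, resid = train_linear_autoencoder(X, k=3)
```

After training, the reconstruction captures the common-factor variance and the residual matrix is dominated by the injected noise; the Keras model built later plays the same role with nonlinear layers.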

Libraries

In [ ]:
#Data Download
import requests as re
from bs4 import BeautifulSoup
import yfinance as yf

#Data Management
import pandas as pd
import numpy as np

#Statistical Analysis
import statsmodels.api as sm

#Warnings
import warnings

#Visualisation
import matplotlib.pyplot as plt
import seaborn as sns


from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

Data Preparation and Exploration


Q1: (4 Marks)
Write a Python function that accepts a URL parameter and retrieves the NASDAQ-100 companies and their ticker symbols by scraping the relevant Wikipedia page using Requests and BeautifulSoup. Your function should return the data as a list of tuples, with each tuple containing the company name and its ticker symbol. Then, call your function with the appropriate Wikipedia page URL and print the data in a 'Company: Ticker' format.


In [ ]:
class NasdaqScraper:
    def __init__(self, url):
        self.url = url

    def fetch_data(self):
        try:
            response = re.get(self.url)
            response.raise_for_status()
            return response.text
        except re.RequestException as e:
            print(f'Error fetching the URL: {e}')
            return None

    def parse_data(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        tables = soup.find_all('table', {'class': 'wikitable'})

        for table in tables:
            if 'Ticker' in str(table):
                target_table = table
                break
        else:
            raise ValueError("Couldn't find the target table")

        from io import StringIO  # newer pandas versions expect a file-like object
        data = pd.read_html(StringIO(str(target_table)))
        return data[0]

    def get_company_ticker(self, display_flag=False):
        html_content = self.fetch_data()
        if html_content:
            nasdaq = self.parse_data(html_content)
            if display_flag:
                display(nasdaq)
            company_ticker = [(row['Company'], row['Ticker']) for index, row in nasdaq.iterrows()]
            return company_ticker
        else:
            return []

    def get_ticker_list(self, display_flag=False):
        return [x[1] for x in self.get_company_ticker(display_flag)]

    def get_company_list(self, display_flag=False):
        return [x[0] for x in self.get_company_ticker(display_flag)]

url = 'https://en.wikipedia.org/wiki/Nasdaq-100'
scraper = NasdaqScraper(url)
print(scraper.get_company_ticker(display_flag=True))
Company Ticker GICS Sector GICS Sub-Industry
0 Adobe Inc. ADBE Information Technology Application Software
1 ADP ADP Industrials Human Resource & Employment Services
2 Airbnb ABNB Consumer Discretionary Hotels, Resorts & Cruise Lines
3 Alphabet Inc. (Class A) GOOGL Communication Services Interactive Media & Services
4 Alphabet Inc. (Class C) GOOG Communication Services Interactive Media & Services
... ... ... ... ...
96 Walgreens Boots Alliance WBA Consumer Staples Drug Retail
97 Warner Bros. Discovery WBD Communication Services Broadcasting
98 Workday, Inc. WDAY Information Technology Application Software
99 Xcel Energy XEL Utilities Multi-Utilities
100 Zscaler ZS Information Technology Application Software

101 rows × 4 columns

[('Adobe Inc.', 'ADBE'), ('ADP', 'ADP'), ('Airbnb', 'ABNB'), ('Alphabet Inc. (Class A)', 'GOOGL'), ('Alphabet Inc. (Class C)', 'GOOG'), ('Amazon', 'AMZN'), ('Advanced Micro Devices Inc.', 'AMD'), ('American Electric Power', 'AEP'), ('Amgen', 'AMGN'), ('Analog Devices', 'ADI'), ('Ansys', 'ANSS'), ('Apple Inc.', 'AAPL'), ('Applied Materials', 'AMAT'), ('ASML Holding', 'ASML'), ('AstraZeneca', 'AZN'), ('Atlassian', 'TEAM'), ('Autodesk', 'ADSK'), ('Baker Hughes', 'BKR'), ('Biogen', 'BIIB'), ('Booking Holdings', 'BKNG'), ('Broadcom Inc.', 'AVGO'), ('Cadence Design Systems', 'CDNS'), ('CDW Corporation', 'CDW'), ('Charter Communications', 'CHTR'), ('Cintas', 'CTAS'), ('Cisco', 'CSCO'), ('Coca-Cola Europacific Partners', 'CCEP'), ('Cognizant', 'CTSH'), ('Comcast', 'CMCSA'), ('Constellation Energy', 'CEG'), ('Copart', 'CPRT'), ('CoStar Group', 'CSGP'), ('Costco', 'COST'), ('CrowdStrike', 'CRWD'), ('CSX Corporation', 'CSX'), ('Datadog', 'DDOG'), ('DexCom', 'DXCM'), ('Diamondback Energy', 'FANG'), ('Dollar Tree', 'DLTR'), ('DoorDash', 'DASH'), ('Electronic Arts', 'EA'), ('Exelon', 'EXC'), ('Fastenal', 'FAST'), ('Fortinet', 'FTNT'), ('GE HealthCare', 'GEHC'), ('Gilead Sciences', 'GILD'), ('GlobalFoundries', 'GFS'), ('Honeywell', 'HON'), ('Idexx Laboratories', 'IDXX'), ('Illumina, Inc.', 'ILMN'), ('Intel', 'INTC'), ('Intuit', 'INTU'), ('Intuitive Surgical', 'ISRG'), ('Keurig Dr Pepper', 'KDP'), ('KLA Corporation', 'KLAC'), ('Kraft Heinz', 'KHC'), ('Lam Research', 'LRCX'), ('Linde plc', 'LIN'), ('Lululemon', 'LULU'), ('Marriott International', 'MAR'), ('Marvell Technology', 'MRVL'), ('MercadoLibre', 'MELI'), ('Meta Platforms', 'META'), ('Microchip Technology', 'MCHP'), ('Micron Technology', 'MU'), ('Microsoft', 'MSFT'), ('Moderna', 'MRNA'), ('Mondelēz International', 'MDLZ'), ('MongoDB Inc.', 'MDB'), ('Monster Beverage', 'MNST'), ('Netflix', 'NFLX'), ('Nvidia', 'NVDA'), ('NXP', 'NXPI'), ("O'Reilly Automotive", 'ORLY'), ('Old Dominion Freight Line', 'ODFL'), ('Onsemi', 'ON'), 
('Paccar', 'PCAR'), ('Palo Alto Networks', 'PANW'), ('Paychex', 'PAYX'), ('PayPal', 'PYPL'), ('PDD Holdings', 'PDD'), ('PepsiCo', 'PEP'), ('Qualcomm', 'QCOM'), ('Regeneron', 'REGN'), ('Roper Technologies', 'ROP'), ('Ross Stores', 'ROST'), ('Sirius XM', 'SIRI'), ('Starbucks', 'SBUX'), ('Synopsys', 'SNPS'), ('Take-Two Interactive', 'TTWO'), ('T-Mobile US', 'TMUS'), ('Tesla, Inc.', 'TSLA'), ('Texas Instruments', 'TXN'), ('The Trade Desk', 'TTD'), ('Verisk', 'VRSK'), ('Vertex Pharmaceuticals', 'VRTX'), ('Walgreens Boots Alliance', 'WBA'), ('Warner Bros. Discovery', 'WBD'), ('Workday, Inc.', 'WDAY'), ('Xcel Energy', 'XEL'), ('Zscaler', 'ZS')]

Q2: (1 Mark)
Given a list of tuples representing NASDAQ-100 companies (where each tuple contains a company name and its ticker symbol), write a Python script to extract all ticker symbols into a separate list called tickers_list.

In [ ]:
print(f'Tickers Only:\n {scraper.get_ticker_list(display_flag=False)}')
Tickers Only:
 ['ADBE', 'ADP', 'ABNB', 'GOOGL', 'GOOG', 'AMZN', 'AMD', 'AEP', 'AMGN', 'ADI', 'ANSS', 'AAPL', 'AMAT', 'ASML', 'AZN', 'TEAM', 'ADSK', 'BKR', 'BIIB', 'BKNG', 'AVGO', 'CDNS', 'CDW', 'CHTR', 'CTAS', 'CSCO', 'CCEP', 'CTSH', 'CMCSA', 'CEG', 'CPRT', 'CSGP', 'COST', 'CRWD', 'CSX', 'DDOG', 'DXCM', 'FANG', 'DLTR', 'DASH', 'EA', 'EXC', 'FAST', 'FTNT', 'GEHC', 'GILD', 'GFS', 'HON', 'IDXX', 'ILMN', 'INTC', 'INTU', 'ISRG', 'KDP', 'KLAC', 'KHC', 'LRCX', 'LIN', 'LULU', 'MAR', 'MRVL', 'MELI', 'META', 'MCHP', 'MU', 'MSFT', 'MRNA', 'MDLZ', 'MDB', 'MNST', 'NFLX', 'NVDA', 'NXPI', 'ORLY', 'ODFL', 'ON', 'PCAR', 'PANW', 'PAYX', 'PYPL', 'PDD', 'PEP', 'QCOM', 'REGN', 'ROP', 'ROST', 'SIRI', 'SBUX', 'SNPS', 'TTWO', 'TMUS', 'TSLA', 'TXN', 'TTD', 'VRSK', 'VRTX', 'WBA', 'WBD', 'WDAY', 'XEL', 'ZS']

Q3: (3 Marks)
Using the yfinance library, write a Python script that accepts a list of stock ticker symbols. For each symbol, download the adjusted closing price data, store it in a dictionary with the ticker symbol as the key, and then convert the final dictionary into a Pandas DataFrame. Handle any errors encountered during data retrieval by printing a message indicating which symbol failed.

In [ ]:
class StockDataFetcher:
    def __init__(self):
        warnings.filterwarnings('always')

    def fetch_adj_close(self, tick_list):
        adjclose = {}

        for tick in tick_list:
            try:
                # auto_adjust=False keeps the "Adj Close" column (newer yfinance
                # versions auto-adjust prices and drop it by default)
                stock_data = yf.download(tick, start="2010-01-02", end="2024-04-22",
                                         progress=False, auto_adjust=False)["Adj Close"]
                adjclose[tick] = stock_data
            except Exception as e:
                warnings.warn(f'Symbol {tick} Failed: {e}')

        stock_data_df = pd.DataFrame(adjclose)
        return stock_data_df

fetcher = StockDataFetcher()
stock_data_df = fetcher.fetch_adj_close(scraper.get_ticker_list(display_flag=False))
display(stock_data_df)
ADBE ADP ABNB GOOGL GOOG AMZN AMD AEP AMGN ADI ... TSLA TXN TTD VRSK VRTX WBA WBD WDAY XEL ZS
Date
2010-01-04 37.090000 26.725889 NaN 15.684434 15.610239 6.695000 9.700000 19.888445 41.200798 22.441114 ... NaN 17.924688 NaN 28.847162 44.240002 23.950077 15.840572 NaN 12.918811 NaN
2010-01-05 37.700001 26.582367 NaN 15.615365 15.541497 6.734500 9.710000 19.660757 40.843876 22.405684 ... NaN 17.821318 NaN 29.040443 42.779999 23.757448 16.463976 NaN 12.765595 NaN
2010-01-06 37.619999 26.519981 NaN 15.221722 15.149715 6.612500 9.570000 19.859974 40.536938 22.363167 ... NaN 17.690378 NaN 29.417345 42.029999 23.577662 16.709249 NaN 12.790113 NaN
2010-01-07 36.889999 26.507496 NaN 14.867367 14.797037 6.500000 9.470000 20.030745 40.165783 22.186016 ... NaN 17.745508 NaN 29.369020 41.500000 23.718924 16.699030 NaN 12.734949 NaN
2010-01-08 36.689999 26.470053 NaN 15.065566 14.994298 6.676000 9.430000 20.269810 40.522667 22.313566 ... NaN 18.152107 NaN 28.992126 40.669998 23.751024 16.750128 NaN 12.741086 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-04-15 470.100006 244.080002 155.600006 154.860001 156.330002 183.619995 160.320007 80.123955 263.637512 189.536087 ... 161.479996 165.159637 80.989998 222.179993 397.359985 17.407288 8.360000 259.630005 53.169998 174.850006
2024-04-16 476.220001 244.210007 156.660004 154.399994 156.000000 183.320007 163.460007 78.737549 263.766602 190.472351 ... 157.110001 166.390747 82.129997 222.100006 394.170013 17.397425 8.140000 257.690002 52.529999 174.320007
2024-04-17 474.450012 242.899994 158.369995 155.470001 156.880005 181.279999 154.020004 80.450737 262.207672 188.679489 ... 155.449997 164.514282 80.129997 222.250000 393.100006 17.387562 8.230000 257.019989 53.189999 172.960007
2024-04-18 473.179993 241.990005 160.100006 156.009995 157.460007 179.220001 155.080002 81.757919 260.896973 186.836823 ... 149.929993 162.498810 80.809998 223.330002 393.480011 17.348114 8.310000 255.639999 53.759998 172.970001
2024-04-19 465.019989 243.309998 155.009995 154.089996 155.720001 174.630005 146.639999 83.381981 267.033386 182.633545 ... 147.050003 158.537354 77.300003 222.520004 394.279999 17.989174 8.400000 252.220001 54.720001 169.210007

3598 rows × 101 columns


Q4: (3 Marks)
Write a Python script to analyze stock data stored in a dictionary stock_data (where each key is a stock ticker symbol, and each value is a Pandas Series of adjusted closing prices). The script should:

  1. Convert the dictionary into a DataFrame.
  2. Calculate the daily returns for each stock.
  3. Identify columns (ticker symbols) with at least 2000 non-NaN values in their daily returns.
  4. Create a new DataFrame that only includes these filtered ticker symbols.
  5. Remove any remaining rows with NaN values in this new DataFrame.

In [ ]:
class StockDataAnalyzer:
    def __init__(self, stock_data):
        self.daily_returns_df = None
        self._analyse_data(stock_data)

    def _analyse_data(self, stock_data):
        # Convert dictionary to DataFrame
        stock_data_df = pd.DataFrame(stock_data)

        # Calculate daily returns
        tmp_daily_returns_df = stock_data_df.pct_change()

        # Identify columns with at least 2000 non-NaN values
        sufficient_data_mask = tmp_daily_returns_df.notna().sum() >= 2000
        filtered_columns = tmp_daily_returns_df.columns[sufficient_data_mask]
        print(f'Insufficient data for: {list(tmp_daily_returns_df.columns[tmp_daily_returns_df.notna().sum() < 2000])}')

        # Create a new DataFrame with these filtered ticker symbols
        filtered_daily_returns_df = tmp_daily_returns_df[filtered_columns]

        # Remove any remaining rows with NaN values
        self.daily_returns_df = filtered_daily_returns_df.dropna()




analyser = StockDataAnalyzer(stock_data_df)
display(analyser.daily_returns_df)
Insufficient data for: ['ABNB', 'CEG', 'CRWD', 'DDOG', 'DASH', 'GEHC', 'GFS', 'MRNA', 'MDB', 'PDD', 'TTD', 'ZS']
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2015-12-10 -0.006699 0.006003 -0.003292 -0.002861 -0.003715 0.042553 -0.024356 0.011274 0.008826 0.004968 ... -0.004777 0.007485 0.011358 0.002995 -0.002616 0.010279 0.000960 -0.002109 0.005037 -0.013889
2015-12-11 0.027653 -0.024924 -0.012657 -0.014130 -0.033473 -0.036735 -0.005831 -0.028308 -0.003850 -0.013951 ... -0.011575 -0.009356 -0.044259 -0.012647 -0.015996 -0.034709 -0.020498 -0.035928 -0.057630 0.004312
2015-12-14 0.020127 0.015241 0.016151 0.012045 0.027744 -0.008475 -0.000367 0.019078 -0.001932 0.003899 ... 0.005141 0.014444 0.007188 0.000178 0.014390 -0.016491 0.010402 -0.033613 0.000253 0.010017
2015-12-15 0.008149 0.010284 -0.003213 -0.005844 0.001110 0.008547 0.027503 0.028524 -0.004752 0.009544 ... 0.023018 0.044907 0.011483 0.023657 0.013924 0.013656 -0.005451 0.002646 -0.000380 0.007934
2015-12-16 0.016380 0.008541 0.021708 0.019761 0.026008 0.076271 0.017309 0.012053 0.017330 0.008354 ... 0.006389 0.025419 0.060699 0.009036 0.021117 0.010658 0.031665 0.024887 0.029378 0.023896
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-04-15 -0.008416 0.000943 -0.018196 -0.017966 -0.013485 -0.018128 -0.014494 -0.006622 -0.010298 -0.018073 ... -0.013377 -0.001437 -0.055949 0.000120 -0.001573 0.002043 -0.011205 0.002398 -0.015397 0.000000
2024-04-16 0.013018 0.000533 -0.002970 -0.002111 -0.001634 0.019586 -0.017303 0.000490 0.004940 -0.004114 ... 0.001032 -0.000188 -0.027062 0.007454 -0.000360 -0.008028 -0.000567 -0.026316 -0.007472 -0.012037
2024-04-17 -0.003717 -0.005364 0.006930 0.005641 -0.011128 -0.057751 0.021758 -0.005910 -0.009413 -0.003641 ... -0.015744 0.001877 -0.010566 -0.011277 0.000675 -0.002715 -0.000567 0.011056 -0.002600 0.012564
2024-04-18 -0.002677 -0.003746 0.003473 0.003697 -0.011364 0.006882 0.016248 -0.004999 -0.009766 -0.003440 ... -0.017603 0.004747 -0.035510 -0.012251 0.004859 0.000967 -0.002269 0.009721 -0.005369 0.010716
2024-04-19 -0.017245 0.005455 -0.012307 -0.011050 -0.025611 -0.054424 0.019864 0.023520 -0.022497 -0.007365 ... -0.000284 0.009201 -0.019209 -0.024378 -0.003627 0.002033 0.036953 0.010830 -0.013378 0.017857

2103 rows × 89 columns


Q5: (1 Mark)
Download the dataset named df_filtered_nasdaq_100 from the GitHub repository of the course.


In [ ]:
class DatasetDownloader:
    def __init__(self, url, index_col=0, parse_dates=[0]):
        self.url = url
        self.dataset = pd.read_csv(self.url, index_col=index_col, parse_dates=parse_dates)

    def display_dataset(self):
        if self.dataset is not None:
            display(self.dataset)
        else:
            print("Dataset does not exist")

    def get_sub_df_ticker(self, ticker, date, length_history):
        df_filtered = self.dataset
        date = pd.to_datetime(date)

        end = df_filtered.index.get_loc(date)
        start = max(0, end - length_history + 1)

        sub_df = df_filtered.iloc[start:end + 1]
        sub_df_ticker = sub_df[ticker]

        return sub_df_ticker

    def generate_standardized_matrix(self, start_date, end_date):
        df_filtered_new = self.dataset.loc[start_date:end_date]
        z_mtx_ret = (df_filtered_new - df_filtered_new.mean()) / df_filtered_new.std()

        return z_mtx_ret


url_nasdaq = 'https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/df_filtered_nasdaq_100.csv'
nasdaq_downloader = DatasetDownloader(url_nasdaq)
nasdaq_downloader.display_dataset()
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2015-12-10 -0.006699 0.006004 -0.003292 -0.002861 -0.003715 0.042553 -0.024356 0.011274 0.008826 0.004968 ... -0.004777 0.007485 0.011358 0.002996 -0.002616 0.010279 0.000960 -0.002109 0.005037 -0.013889
2015-12-11 0.027653 -0.024924 -0.012657 -0.014130 -0.033473 -0.036735 -0.005831 -0.028308 -0.003850 -0.013951 ... -0.011575 -0.009356 -0.044259 -0.012648 -0.015996 -0.034709 -0.020499 -0.035928 -0.057630 0.004311
2015-12-14 0.020127 0.015241 0.016151 0.012045 0.027744 -0.008475 -0.000366 0.019078 -0.001932 0.003899 ... 0.005141 0.014444 0.007188 0.000178 0.014390 -0.016491 0.010402 -0.033613 0.000253 0.010017
2015-12-15 0.008149 0.010283 -0.003213 -0.005844 0.001110 0.008547 0.027503 0.028525 -0.004752 0.009544 ... 0.023018 0.044907 0.011483 0.023657 0.013923 0.013656 -0.005450 0.002646 -0.000380 0.007934
2015-12-16 0.016380 0.008541 0.021708 0.019761 0.026008 0.076271 0.017309 0.012053 0.017330 0.008354 ... 0.006389 0.025419 0.060699 0.009036 0.021117 0.010658 0.031664 0.024887 0.029378 0.023897
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-05-06 0.015241 0.003514 0.005142 0.004971 0.013372 0.034396 0.002370 -0.037939 0.018484 0.006478 ... 0.016863 -0.013548 0.019703 0.015427 0.019087 0.003540 -0.030881 -0.001255 -0.022949 0.002028
2024-05-07 -0.002674 0.009805 0.018739 0.018548 0.000318 -0.008666 0.011936 0.002738 0.001230 0.010728 ... -0.000067 -0.001109 -0.037616 0.012752 0.021252 0.019230 0.005214 -0.023869 -0.001921 0.012141
2024-05-08 -0.008471 -0.008894 -0.010920 -0.010521 -0.004026 -0.005245 0.007900 0.023343 0.006337 0.005907 ... -0.015910 0.003946 -0.017378 0.007007 -0.009838 0.020915 -0.006916 0.003861 0.000802 -0.001636
2024-05-09 -0.011166 0.009097 0.003424 0.002454 0.007979 -0.008007 0.004085 0.018060 -0.000342 0.000887 ... -0.001987 0.011361 -0.015739 0.007448 0.001676 0.000406 0.001161 0.030769 -0.014702 0.005644
2024-05-10 -0.000746 0.006975 -0.007708 -0.007518 -0.010660 -0.003084 0.007257 -0.008662 0.011719 0.003056 ... 0.001373 -0.002915 -0.020352 0.009335 0.013593 0.009046 -0.003478 0.013682 0.001545 0.003983

2118 rows × 89 columns


Q6: (3 Marks)
Conduct an in-depth analysis of the df_filtered_nasdaq_100 dataset from GitHub. Answer the following questions:

  • Which stock had the best performance over the entire period?
  • What is the average daily return of 'AAPL'?
  • What is the worst daily return? Provide the stock name and the date it occurred.

In [ ]:
class StockAnalysis(DatasetDownloader):
    def __init__(self, url):
        super().__init__(url)
        self.cumulative_returns = None

    def analyze(self):
        df_filtered_nasdaq_100 = self.dataset
        self.cumulative_returns = (1 + df_filtered_nasdaq_100).cumprod() - 1

        best_performance_stock = self.cumulative_returns.iloc[-1].idxmax()
        best_performance_value = self.cumulative_returns.iloc[-1].max()
        print(f'The best performing stock is {best_performance_stock} and has a cumulative return of {best_performance_value * 100:.3f} %')

        average_daily_return_aapl = df_filtered_nasdaq_100['AAPL'].mean()
        print(f'The average daily return of AAPL is {average_daily_return_aapl*100:.4f} %')

        min_return = df_filtered_nasdaq_100.min().min()
        min_return_stock = df_filtered_nasdaq_100.min().idxmin()
        min_return_date = df_filtered_nasdaq_100[df_filtered_nasdaq_100[min_return_stock] == min_return].index[0]
        print(f'The worst daily return of {min_return*100:.4f} % was by {min_return_stock} on {min_return_date.date()}')

        return best_performance_stock, best_performance_value, average_daily_return_aapl, min_return, min_return_stock, min_return_date

    def plot_results(self, best_performance_stock, best_performance_value, average_daily_return_aapl, min_return, min_return_stock, min_return_date):
        plt.figure(figsize=(14, 8))
        for stock in [best_performance_stock, 'AAPL', min_return_stock]:
            plt.plot(self.cumulative_returns.index, self.cumulative_returns[stock], label=stock)

        plt.annotate(f'Best performing stock: {best_performance_stock}\nCumulative Return: {best_performance_value:.2f}',
                     xy=(self.cumulative_returns.index[-1], self.cumulative_returns[best_performance_stock].iloc[-1]),
                     xytext=(self.cumulative_returns.index[-100], self.cumulative_returns[best_performance_stock].iloc[-1] + 0.5),
                     arrowprops=dict(facecolor='green', shrink=0.05))

        plt.annotate(f'AAPL: {average_daily_return_aapl:.4f}',
                     xy=(self.cumulative_returns.index[-1], self.cumulative_returns['AAPL'].iloc[-1]),
                     xytext=(self.cumulative_returns.index[-100], self.cumulative_returns['AAPL'].iloc[-1] - 0.5),
                     arrowprops=dict(facecolor='blue', shrink=0.05))

        plt.annotate(f'Worst daily return: {min_return:.4f}\nStock: {min_return_stock}\nDate: {min_return_date.date()}',
                     xy=(min_return_date, self.cumulative_returns[min_return_stock].loc[min_return_date]),
                     xytext=(min_return_date, self.cumulative_returns[min_return_stock].loc[min_return_date] - 0.5),
                     arrowprops=dict(facecolor='red', shrink=0.05))

        plt.title('Cumulative Returns of Selected Stocks')
        plt.xlabel('Date')
        plt.ylabel('Cumulative Return')
        plt.legend()
        plt.grid(True)
        plt.show()


url_nasdaq = 'https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/df_filtered_nasdaq_100.csv'
analyzer = StockAnalysis(url_nasdaq)
analyzer.display_dataset()
best_performance_stock, best_performance_value, average_daily_return_aapl, min_return, min_return_stock, min_return_date = analyzer.analyze()
analyzer.plot_results(best_performance_stock, best_performance_value, average_daily_return_aapl, min_return, min_return_stock, min_return_date)
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2015-12-10 -0.006699 0.006004 -0.003292 -0.002861 -0.003715 0.042553 -0.024356 0.011274 0.008826 0.004968 ... -0.004777 0.007485 0.011358 0.002996 -0.002616 0.010279 0.000960 -0.002109 0.005037 -0.013889
2015-12-11 0.027653 -0.024924 -0.012657 -0.014130 -0.033473 -0.036735 -0.005831 -0.028308 -0.003850 -0.013951 ... -0.011575 -0.009356 -0.044259 -0.012648 -0.015996 -0.034709 -0.020499 -0.035928 -0.057630 0.004311
2015-12-14 0.020127 0.015241 0.016151 0.012045 0.027744 -0.008475 -0.000366 0.019078 -0.001932 0.003899 ... 0.005141 0.014444 0.007188 0.000178 0.014390 -0.016491 0.010402 -0.033613 0.000253 0.010017
2015-12-15 0.008149 0.010283 -0.003213 -0.005844 0.001110 0.008547 0.027503 0.028525 -0.004752 0.009544 ... 0.023018 0.044907 0.011483 0.023657 0.013923 0.013656 -0.005450 0.002646 -0.000380 0.007934
2015-12-16 0.016380 0.008541 0.021708 0.019761 0.026008 0.076271 0.017309 0.012053 0.017330 0.008354 ... 0.006389 0.025419 0.060699 0.009036 0.021117 0.010658 0.031664 0.024887 0.029378 0.023897
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-05-06 0.015241 0.003514 0.005142 0.004971 0.013372 0.034396 0.002370 -0.037939 0.018484 0.006478 ... 0.016863 -0.013548 0.019703 0.015427 0.019087 0.003540 -0.030881 -0.001255 -0.022949 0.002028
2024-05-07 -0.002674 0.009805 0.018739 0.018548 0.000318 -0.008666 0.011936 0.002738 0.001230 0.010728 ... -0.000067 -0.001109 -0.037616 0.012752 0.021252 0.019230 0.005214 -0.023869 -0.001921 0.012141
2024-05-08 -0.008471 -0.008894 -0.010920 -0.010521 -0.004026 -0.005245 0.007900 0.023343 0.006337 0.005907 ... -0.015910 0.003946 -0.017378 0.007007 -0.009838 0.020915 -0.006916 0.003861 0.000802 -0.001636
2024-05-09 -0.011166 0.009097 0.003424 0.002454 0.007979 -0.008007 0.004085 0.018060 -0.000342 0.000887 ... -0.001987 0.011361 -0.015739 0.007448 0.001676 0.000406 0.001161 0.030769 -0.014702 0.005644
2024-05-10 -0.000746 0.006975 -0.007708 -0.007518 -0.010660 -0.003084 0.007257 -0.008662 0.011719 0.003056 ... 0.001373 -0.002915 -0.020352 0.009335 0.013593 0.009046 -0.003478 0.013682 0.001545 0.003983

2118 rows × 89 columns

The best performing stock is NVDA and has a cumulative return of 11158.843 %
The average daily return of AAPL is 0.1085 %
The worst daily return of -44.6458 was by FANG on 2020-03-09 00:00:00
[Figure: Cumulative Returns of Selected Stocks — NVDA, AAPL, and FANG, annotated with the best cumulative return, AAPL's average daily return, and the worst daily return]

Fama-French Analysis

The Fama-French five-factor model is an extension of the classic three-factor model used in finance to describe stock returns. It is designed to better capture the risk associated with stocks and explain differences in returns. This model includes the following factors:

  1. Market Risk (MKT): The excess return of the market over the risk-free rate. It captures the overall market's premium.
  2. Size (SMB, "Small Minus Big"): The performance of small-cap stocks relative to large-cap stocks.
  3. Value (HML, "High Minus Low"): The performance of stocks with high book-to-market values relative to those with low book-to-market values.
  4. Profitability (RMW, "Robust Minus Weak"): The difference in returns between companies with robust (high) and weak (low) profitability.
  5. Investment (CMA, "Conservative Minus Aggressive"): The difference in returns between companies that invest conservatively and those that invest aggressively.

Additional Factor

  1. Momentum (MOM): This factor represents the tendency of stocks that have performed well in the past to continue performing well, and the reverse for stocks that have performed poorly.

Mathematical Representation

The return of a stock $R_i^t$ at time $t$ can be modeled as follows:

$$ R_i^t - R_f^t = \alpha_i^t + \beta_{i,MKT}^t(R_M^t - R_f^t) + \beta_{i,SMB}^t \cdot SMB^t + \beta_{i,HML}^t \cdot HML^t + \beta_{i,RMW}^t \cdot RMW^t + \beta_{i,CMA}^t \cdot CMA^t + \beta_{i,MOM}^t \cdot MOM^t + \epsilon_i^t $$

Where:

  • $R_i^t$ is the return of stock $i$ at time $t$
  • $R_f^t$ is the risk-free rate at time $t$
  • $R_M^t$ is the market return at time $t$
  • $\alpha_i^t$ is the abnormal return (alpha) of stock $i$ at time $t$
  • $\beta^t$ coefficients represent the sensitivity of the stock's returns to each factor at time $t$
  • $\epsilon_i^t$ is the error term, the idiosyncratic risk unique to stock $i$ at time $t$

This model is particularly useful for identifying which factors significantly impact stock returns and for constructing a diversified portfolio that is optimized for given risk preferences.
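
As an illustration of the regression above (on simulated data, not the coursework dataset), the factor loadings can be recovered by OLS; solving the least-squares problem directly with NumPy gives the same fit that `sm.OLS` from statsmodels would produce:

```python
import numpy as np

rng = np.random.default_rng(42)
T = 1000
# Simulated daily factor returns (Mkt-RF, SMB, HML, RMW, CMA, MOM), in decimals
factors = rng.normal(0, 0.01, size=(T, 6))
true_beta = np.array([1.1, 0.4, -0.3, 0.2, 0.1, -0.2])
alpha = 0.0002
# Excess stock return = alpha + factor exposures + idiosyncratic noise epsilon
excess_ret = alpha + factors @ true_beta + rng.normal(0, 0.005, size=T)

# OLS: prepend an intercept column and solve the least-squares problem
X = np.column_stack([np.ones(T), factors])
coef, *_ = np.linalg.lstsq(X, excess_ret, rcond=None)
alpha_hat, beta_hat = coef[0], coef[1:]
```

With 1,000 observations the estimated betas sit close to the true exposures, which is exactly the rolling estimation performed per stock in the questions below.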


Q7: (1 Mark)
Download the fama_french_dataset from the course's GitHub repository.


In [ ]:
fama_french_downloader = DatasetDownloader('https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/fama_french_dataset.csv')
fama_french_downloader.display_dataset()
Mkt-RF SMB HML RMW CMA RF Mom
1963-07-01 -0.67 0.02 -0.35 0.03 0.13 0.012 -0.21
1963-07-02 0.79 -0.28 0.28 -0.08 -0.21 0.012 0.42
1963-07-03 0.63 -0.18 -0.10 0.13 -0.25 0.012 0.41
1963-07-05 0.40 0.09 -0.28 0.07 -0.30 0.012 0.07
1963-07-08 -0.63 0.07 -0.20 -0.27 0.06 0.012 -0.45
... ... ... ... ... ... ... ...
2024-03-22 -0.23 -0.98 -0.53 0.29 -0.37 0.021 0.43
2024-03-25 -0.26 -0.10 0.88 -0.22 -0.17 0.021 -0.34
2024-03-26 -0.26 0.10 -0.13 -0.50 0.23 0.021 0.09
2024-03-27 0.88 1.29 0.91 -0.14 0.58 0.021 -1.34
2024-03-28 0.10 0.45 0.48 -0.07 0.09 0.021 -0.44

15290 rows × 7 columns


Q8: (5 Marks)

Write a Python function called get_sub_df_ticker(ticker, date, df_filtered, length_history) that extracts a historical sub-dataframe for a given ticker from df_filtered. The function should use length_history to determine the number of trading days to include, ending at the specified date. Return the sub-dataframe for the specified ticker.


In [ ]:
#class DatasetDownloader:
#
#      .....
#      .....
#      .....
#
#
#    def get_sub_df_ticker(self, ticker, date, length_history):
#        df_filtered = self.dataset
#        date = pd.to_datetime(date)
#
#        end = df_filtered.index.get_loc(date)
#        start = max(0, end - length_history + 1)
#
#        sub_df = df_filtered.iloc[start:end + 1]
#        sub_df_ticker = sub_df[ticker]
#
#        return sub_df_ticker

nasdaq_downloader.get_sub_df_ticker("GOOG", '28-03-2024', 75)
Out[ ]:
Date
2023-12-11   -0.014198
2023-12-12   -0.007869
2023-12-13    0.002469
2023-12-14   -0.005748
2023-12-15    0.004805
                ...   
2024-03-22    0.020371
2024-03-25   -0.004085
2024-03-26    0.003639
2024-03-27    0.001582
2024-03-28    0.002106
Name: GOOG, Length: 75, dtype: float64

Q9: (4 Marks)
Create a Python function named df_ticker_with_fama_french(ticker, date, df_filtered, length_history, fama_french_data) that uses get_sub_df_ticker to extract historical data for a specific ticker. Incorporate the Fama-French factors from fama_french_data into the extracted sub-dataframe. Adjust the ticker's returns by subtracting the risk-free rate ('RF') and add other relevant Fama-French factors ('Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', and 'Mom'). Return the resulting sub-dataframe.


In [ ]:
class DatasetFamaFrenchCombinator(DatasetDownloader):
    def __init__(self, url, fama_french_url='https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/fama_french_dataset.csv'):
        super().__init__(url)
        self.fama_french_data = pd.read_csv(fama_french_url, index_col=0, parse_dates=[0])

    def df_ticker_with_fama_french(self, ticker, date, length_history):
        sub_df = self.get_sub_df_ticker(ticker, date, length_history)
        sub_df = pd.DataFrame(sub_df)
        sub_df_famfre = pd.merge(sub_df, self.fama_french_data, left_index=True, right_index=True)
        sub_df_famfre[ticker] = sub_df_famfre[ticker] - sub_df_famfre['RF']

        return sub_df_famfre

    def extract_beta_fama_french(self, ticker, date, length_history):
        sub_df_ff = self.df_ticker_with_fama_french(ticker, date, length_history)
        endog = sub_df_ff[ticker]
        exog = sub_df_ff.drop([ticker, 'RF'], axis=1)
        exog = sm.add_constant(exog)
        ff_regression = sm.OLS(endog, exog)
        ff_res = ff_regression.fit()
        summary = ff_res.summary()

        return summary

url_nasdaq = 'https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/df_filtered_nasdaq_100.csv'
famafrenchcombinator = DatasetFamaFrenchCombinator(url_nasdaq)
famafrenchcombinator.df_ticker_with_fama_french("GOOG", '28-03-2024', 75)
Out[ ]:
GOOG Mkt-RF SMB HML RMW CMA RF Mom
2023-12-11 -0.035198 0.35 -0.43 -0.12 0.02 0.65 0.021 -0.10
2023-12-12 -0.028869 0.34 -0.64 -0.88 -0.38 -0.22 0.021 0.58
2023-12-13 -0.018531 1.55 1.92 1.35 -0.99 0.17 0.021 -2.07
2023-12-14 -0.026748 0.51 1.84 2.05 0.05 0.41 0.021 -1.57
2023-12-15 -0.016195 -0.06 -0.59 -0.57 0.14 -0.35 0.021 0.93
... ... ... ... ... ... ... ... ...
2024-03-22 -0.000629 -0.23 -0.98 -0.53 0.29 -0.37 0.021 0.43
2024-03-25 -0.025085 -0.26 -0.10 0.88 -0.22 -0.17 0.021 -0.34
2024-03-26 -0.017361 -0.26 0.10 -0.13 -0.50 0.23 0.021 0.09
2024-03-27 -0.019418 0.88 1.29 0.91 -0.14 0.58 0.021 -1.34
2024-03-28 -0.018894 0.10 0.45 0.48 -0.07 0.09 0.021 -0.44

75 rows × 8 columns


Q10: (5 Marks)
Write a Python function named extract_beta_fama_french to perform a rolling regression analysis for a given stock at a specific time point using the Fama-French model. The function should accept the following parameters:

  • ticker: A string indicating the stock symbol.
  • date: A string specifying the date for the analysis.
  • length_history: An integer representing the number of days of historical data to include.
  • df_filtered: A pandas DataFrame (assumed to be derived from question 5) containing filtered stock data.
  • fama_french_data: A pandas DataFrame (assumed to be from question 7) that includes Fama-French factors.

Utilize the statsmodels.api library to conduct the regression.


In [ ]:
#class DatasetFamaFrenchCombinator(DatasetDownloader):
#    def __init__(self, url, fama_french_url='https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/fama_french_dataset.csv'):
#        super().__init__(url)
#        self.fama_french_data = pd.read_csv(fama_french_url, index_col=0, parse_dates=[0])
#
#    def df_ticker_with_fama_french(self, ticker, date, length_history):
#        sub_df = self.get_sub_df_ticker(ticker, date, length_history)
#        sub_df = pd.DataFrame(sub_df)
#        sub_df_famfre = pd.merge(sub_df, self.fama_french_data, left_index=True, right_index=True)
#        sub_df_famfre[ticker] = sub_df_famfre[ticker] - sub_df_famfre['RF']
#
#        return sub_df_famfre
#
#    def extract_beta_fama_french(self, ticker, date, length_history):
#        sub_df_ff = self.df_ticker_with_fama_french(ticker, date, length_history)
#        endog = sub_df_ff[ticker]
#        exog = sub_df_ff.drop([ticker, 'RF'], axis=1)
#        exog = sm.add_constant(exog)
#        ff_regression = sm.OLS(endog, exog)
#        ff_res = ff_regression.fit()
#        summary = ff_res.summary()
#
#        return summary

famafrenchcombinator.extract_beta_fama_french("GOOG", '28-03-2024', 75)
Out[ ]:
OLS Regression Results
Dep. Variable: GOOG R-squared: 0.276
Model: OLS Adj. R-squared: 0.213
Method: Least Squares F-statistic: 4.330
Date: Wed, 05 Jun 2024 Prob (F-statistic): 0.000931
Time: 19:43:57 Log-Likelihood: 214.68
No. Observations: 75 AIC: -415.4
Df Residuals: 68 BIC: -399.1
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -0.0210 0.002 -11.545 0.000 -0.025 -0.017
Mkt-RF 0.0110 0.003 3.298 0.002 0.004 0.018
SMB 0.0008 0.003 0.255 0.800 -0.006 0.007
HML -0.0051 0.004 -1.347 0.182 -0.013 0.002
RMW 0.0105 0.004 2.341 0.022 0.002 0.019
CMA -0.0040 0.008 -0.524 0.602 -0.019 0.011
Mom -0.0027 0.004 -0.723 0.472 -0.010 0.005
Omnibus: 12.199 Durbin-Watson: 1.911
Prob(Omnibus): 0.002 Jarque-Bera (JB): 18.188
Skew: -0.628 Prob(JB): 0.000112
Kurtosis: 5.059 Cond. No. 6.26


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



Q11: (2 Marks)
Apply the extract_beta_fama_french function to the stock symbol 'AAPL' for the date '2024-03-28', using a historical data length of 252 days. Ensure that the df_filtered and fama_french_data DataFrames are correctly prepared and available in your environment before executing this function. The parameters for the function call are set as follows:

  • Ticker: 'AAPL'
  • Date: '2024-03-28'
  • Length of History: 252 days

In [ ]:
famafrenchcombinator.extract_beta_fama_french("AAPL", '28-03-2024', 252)
Out[ ]:
OLS Regression Results
Dep. Variable: AAPL R-squared: 0.475
Model: OLS Adj. R-squared: 0.462
Method: Least Squares F-statistic: 36.96
Date: Wed, 05 Jun 2024 Prob (F-statistic): 9.04e-32
Time: 19:43:57 Log-Likelihood: 827.28
No. Observations: 252 AIC: -1641.
Df Residuals: 245 BIC: -1616.
Df Model: 6
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -0.0207 0.001 -35.204 0.000 -0.022 -0.020
Mkt-RF 0.0111 0.001 11.966 0.000 0.009 0.013
SMB 0.0008 0.001 0.690 0.491 -0.001 0.003
HML -0.0076 0.001 -5.762 0.000 -0.010 -0.005
RMW 0.0053 0.001 3.607 0.000 0.002 0.008
CMA 0.0019 0.002 1.052 0.294 -0.002 0.006
Mom -0.0015 0.001 -1.592 0.113 -0.003 0.000
Omnibus: 36.898 Durbin-Watson: 1.622
Prob(Omnibus): 0.000 Jarque-Bera (JB): 83.475
Skew: -0.702 Prob(JB): 7.48e-19
Kurtosis: 5.445 Cond. No. 4.05


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



Q12: (3 Marks)
Once the extract_beta_fama_french function has been applied to 'AAPL' with the specified parameters, the next step is to analyze the regression summary to identify which Fama-French factor explains the most variance in 'AAPL' returns during the specified period.

Follow these steps to perform the analysis:

  1. Review the Summary: Examine the regression output, focusing on the coefficients and their statistical significance (p-values).
  2. Identify Key Factor: Determine which factor has the highest absolute coefficient value and is statistically significant (typically p < 0.05). This factor can be considered as having the strongest influence on 'AAPL' returns for the period.

Looking at AAPL's Fama-French factor decomposition regression summary for the year ending on March 28, 2024, we observe an adjusted R-squared of 0.462, indicating that 46.2% of the variability in AAPL's returns is explained by the model.

The statistically significant F-statistic (F-stat = 36.96, p-value = 9.04e-32) confirms the overall relevance of the model.

The coefficients and p-values reveal the impact of each factor on AAPL's returns:

Market Risk Premium: With a beta of 0.0111 and a p-value of 0.000 (p < 0.05), this factor has the highest beta coefficient and is statistically significant, indicating AAPL tends to co-move with market movements. This is expected as AAPL has a large weighting in major indexes.

High Minus Low (Value Factor): With a beta of -0.0076 and a p-value of 0.000 (p < 0.05), this factor is inversely related to AAPL's returns, underscoring AAPL's status as a growth stock rather than a value stock.

Robust Minus Weak (Profitability Factor): With a beta of 0.0053 and a p-value of 0.000 (p < 0.05), this factor is also statistically significant and positively impacts AAPL, highlighting that the stock's returns are positively and significantly influenced by robust profitability (though the impact is small).

The SMB (Size), CMA (Investment), and Momentum factors do not significantly predict AAPL's returns. The insignificance of the Momentum factor is further supported by the Durbin-Watson statistic (1.622), which is close to 2, indicating no autocorrelation in the residuals of the regression model and thus no trend.

PCA Analysis¶

In the literature, another method exists for extracting residuals for each stock: the PCA approach, which identifies hidden factors in the data. Let's describe this method.

The return of a stock $R_i^t$ at time $t$ can be modeled as follows:

$$ R_i^t = \sum_{j=1}^m\beta_{i,j}^t F_j^t + \epsilon_i^t $$

Where:

  • $ R_i^t $ is the return of stock $i$ at time $t$
  • $m$ is the number of factors selected from PCA
  • $ F_j^t $ is the $j$-th hidden factor constructed from PCA at time $t$
  • $\beta_{i,j}^t $ are the coefficients representing the sensitivity of the stock returns to each hidden factor.
  • $\epsilon_i^t $ is the residual term for stock $i$ at time $t$, representing the portion of the return not explained by the PCA factors.

Representation of Stock Return Data¶

Consider the return data for $N$ stocks over $T$ periods, represented by the matrix $R$ of size $T \times N$:

$$ R = \left[ \begin{array}{cccc} R_1^T & R_2^T & \cdots & R_N^T \\ R_1^{T-1} & R_2^{T-1} & \cdots & R_N^{T-1} \\ \vdots & \vdots & \ddots & \vdots \\ R_1^1 & R_2^1 & \cdots & R_N^1 \\ \end{array} \right] $$

Each element $R_i^k$ of the matrix represents the return of stock $i$ at time $k$ and is defined as:

$$ R_i^k = \frac{S_{i,k} - S_{i, k-1}}{S_{i, k-1}}, \quad k=1,\cdots, T, \quad i=1,\cdots,N $$

where $S_{i,k}$ denotes the adjusted close price of stock $i$ at time $k$.
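As a quick illustration, the return definition above maps directly onto pandas' pct_change. The price values below are hypothetical stand-ins, not actual coursework data:

```python
import pandas as pd

# Hypothetical adjusted close prices S_{i,k} for two illustrative tickers
prices = pd.DataFrame(
    {"AAA": [100.0, 102.0, 101.0], "BBB": [50.0, 49.0, 50.47]},
    index=pd.to_datetime(["2024-03-26", "2024-03-27", "2024-03-28"]),
)

# R_i^k = (S_{i,k} - S_{i,k-1}) / S_{i,k-1}; pct_change applies this column-wise
returns = prices.pct_change().dropna()
```

The first row is dropped because no previous price exists to form a return.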

Standardization of Returns¶

To adjust for varying volatilities across stocks, we standardize the returns as follows:

$$ Z_i^t = \frac{R_i^t - \mu_i}{\sigma_i} $$

where $\mu_i$ and $\sigma_i$ are the mean and standard deviation of returns for stock $i$ over the period $[t-T, t]$, respectively.

Empirical Correlation Matrix¶

The empirical correlation matrix $C$ is computed from the standardized returns:

$$ C = \frac{1}{T-1} Z^T Z $$

where $Z^T$ is the transpose of matrix $Z$.

Singular Value Decomposition (SVD)¶

We apply Singular Value Decomposition to the correlation matrix $C$:

$$ C = U \Sigma V^T $$

Here, $U$ and $V$ are orthogonal matrices representing the left and right singular vectors, respectively, and $\Sigma$ is a diagonal matrix containing the singular values, which are the square roots of the eigenvalues of $C^T C$.

Construction of Hidden Factors¶

For each of the top $m$ components, we construct the selected hidden factors as follows:

$$ F_j^t = \sum_{i=1}^N \frac{\lambda_{i,j}}{\sigma_i} R_i^t $$

where $\lambda_{i,j}$ is the $i$-th component of the $j$-th eigenvector (ranked by eigenvalue magnitude).
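A minimal NumPy sketch of this construction, using randomly generated stand-in returns rather than the coursework dataset, reduces the factor formula to a single matrix product of the volatility-scaled returns with the top-$m$ eigenvectors:

```python
import numpy as np

# Stand-in for the (T x N) returns matrix; the real data comes from df_filtered
rng = np.random.default_rng(42)
returns = rng.normal(0.0, 0.01, size=(252, 89))

sigmas = returns.std(axis=0, ddof=1)                   # per-stock sigma_i
z = (returns - returns.mean(axis=0)) / sigmas          # standardized returns Z
corr = (z.T @ z) / (len(z) - 1)                        # empirical correlation C

eigvals, eigvecs = np.linalg.eigh(corr)                # symmetric eigendecomposition
order = np.argsort(eigvals)[::-1]                      # rank by eigenvalue magnitude
eigvecs = eigvecs[:, order]

m = 20
# F_j^t = sum_i (lambda_{i,j} / sigma_i) * R_i^t, vectorized over t and j
factors = (returns / sigmas) @ eigvecs[:, :m]
```

The double sum over stocks and time steps disappears into the `@` product, which is both faster and closer to the matrix notation above.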


Q13 (3 Marks):

For the specified period from March 29, 2023 ('2023-03-29'), to March 28, 2024 ('2024-03-28'), generate the matrix $Z$ by standardizing the stock returns using the DataFrame df_filtered_new.


In [ ]:
#class DatasetDownloader:
#
#....
#....
#....
#
#    def generate_standardized_matrix(self, start_date, end_date):
#        df_filtered_new = self.dataset.loc[start_date:end_date]
#        z_mtx_ret = (df_filtered_new - df_filtered_new.mean()) / df_filtered_new.std()
#
#        return z_mtx_ret

nasdaq_downloader.generate_standardized_matrix('2023-03-29', '2024-03-28')
Out[ ]:
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2023-03-29 0.658925 2.189521 0.106285 0.207821 1.503570 0.444433 1.020467 0.727398 1.860839 0.356414 ... 0.517876 0.615242 0.814986 1.344650 1.105776 0.127319 0.497115 0.429736 2.277783 1.269711
2023-03-30 0.273051 -0.221306 -0.389329 -0.434766 0.787034 0.526996 0.277291 0.076716 1.630406 0.936497 ... -0.109884 0.439972 0.233493 1.187056 0.239468 -0.525980 0.691243 0.446100 0.389915 0.430715
2023-03-31 0.360570 1.136308 1.540732 1.439604 0.531735 -0.056439 0.475365 0.008639 0.935425 1.044069 ... 1.337544 0.117646 2.058867 0.640237 0.325258 0.538943 -0.008816 0.565842 1.611597 0.597545
2023-04-03 -0.713053 -2.259591 0.252735 0.407399 -0.591892 -0.600163 -0.086824 0.759731 -0.328378 -0.555499 ... -0.377687 1.191774 -2.030038 -0.684867 -0.214460 0.183344 1.205798 -0.547975 -0.634285 0.121519
2023-04-04 0.560730 -1.137430 0.099652 0.013877 0.658632 -0.342217 0.223111 0.872403 -0.399698 -0.127907 ... 1.434972 -0.347689 -0.377655 -1.394525 -0.538698 -0.487155 0.553153 0.755053 -0.537767 1.011197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-03-22 -1.146803 -0.516681 1.151431 1.085070 0.074915 0.081847 -0.150701 -0.278013 -0.555224 0.126867 ... 0.046870 -0.246035 -0.386620 -0.054031 -0.476030 -0.091837 -0.421327 -0.946797 0.106149 -0.002851
2024-03-25 0.659349 -1.221008 -0.372489 -0.341074 0.109668 -0.292706 -0.084892 1.184710 -0.959285 -0.282029 ... -2.575803 0.240999 0.343241 -0.651297 -1.151639 -0.024341 0.166127 0.118796 -0.427841 0.321648
2024-03-26 -0.032708 0.234343 0.131651 0.109342 -0.556130 -0.244717 -0.377797 0.183364 -0.577464 0.317533 ... 0.150958 -0.070195 0.960796 -1.177043 -0.384593 0.306386 -0.206326 -0.246683 0.237594 -0.878121
2024-03-27 -0.363721 1.052056 -0.024166 -0.010591 0.315892 0.224881 2.192442 1.128111 1.411127 -0.309662 ... 0.034693 0.474269 0.396878 1.991081 0.941800 -0.265792 1.179504 1.004419 -0.793726 2.193537
2024-03-28 -0.048375 0.411934 -0.078409 0.019962 0.022729 0.067776 1.190631 -0.583106 1.407557 -0.141870 ... 0.577964 0.645940 -0.749067 0.514492 0.585964 0.026648 1.496052 0.367481 -0.251144 0.527595

252 rows × 89 columns


Q14: (1 Mark)
Download the Z_matrix file from the course's GitHub repository.


In [ ]:
z = DatasetDownloader("https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/Z_matrix.csv", index_col='Date', parse_dates=True)
z.display_dataset()
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2023-03-29 0.658925 2.189521 0.106285 0.207821 1.503570 0.444433 1.020467 0.727398 1.860839 0.356414 ... 0.517876 0.615242 0.814986 1.344650 1.105776 0.127319 0.497115 0.429736 2.277783 1.269711
2023-03-30 0.273051 -0.221306 -0.389329 -0.434766 0.787034 0.526996 0.277291 0.076716 1.630406 0.936497 ... -0.109884 0.439972 0.233493 1.187056 0.239468 -0.525980 0.691243 0.446100 0.389915 0.430715
2023-03-31 0.360570 1.136308 1.540732 1.439604 0.531735 -0.056439 0.475365 0.008639 0.935425 1.044069 ... 1.337544 0.117646 2.058867 0.640237 0.325258 0.538943 -0.008816 0.565842 1.611597 0.597545
2023-04-03 -0.713053 -2.259591 0.252735 0.407399 -0.591892 -0.600163 -0.086824 0.759731 -0.328378 -0.555499 ... -0.377687 1.191774 -2.030038 -0.684867 -0.214460 0.183344 1.205798 -0.547975 -0.634285 0.121519
2023-04-04 0.560730 -1.137430 0.099652 0.013877 0.658632 -0.342217 0.223111 0.872403 -0.399698 -0.127907 ... 1.434972 -0.347689 -0.377655 -1.394525 -0.538698 -0.487155 0.553153 0.755053 -0.537767 1.011197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-03-22 -1.146803 -0.516681 1.151431 1.085070 0.074915 0.081847 -0.150701 -0.278013 -0.555224 0.126867 ... 0.046870 -0.246035 -0.386620 -0.054031 -0.476030 -0.091837 -0.421327 -0.946797 0.106149 -0.002851
2024-03-25 0.659349 -1.221008 -0.372489 -0.341074 0.109668 -0.292706 -0.084892 1.184710 -0.959285 -0.282029 ... -2.575803 0.240999 0.343241 -0.651297 -1.151639 -0.024341 0.166127 0.118796 -0.427841 0.321648
2024-03-26 -0.032708 0.234343 0.131651 0.109342 -0.556130 -0.244717 -0.377797 0.183364 -0.577464 0.317533 ... 0.150958 -0.070195 0.960796 -1.177043 -0.384593 0.306386 -0.206326 -0.246683 0.237594 -0.878121
2024-03-27 -0.363721 1.052056 -0.024166 -0.010591 0.315892 0.224881 2.192442 1.128111 1.411127 -0.309662 ... 0.034693 0.474269 0.396878 1.991081 0.941800 -0.265792 1.179504 1.004419 -0.793726 2.193537
2024-03-28 -0.048375 0.411934 -0.078409 0.019962 0.022729 0.067776 1.190631 -0.583106 1.407557 -0.141870 ... 0.577964 0.645940 -0.749067 0.514492 0.585964 0.026648 1.496052 0.367481 -0.251144 0.527595

252 rows × 89 columns


Q15: (3 Marks)
For the specified period from March 29, 2023 ('2023-03-29'), to March 28, 2024 ('2024-03-28'), compute the correlation matrix $C$ using the matrix Z_matrix.


In [ ]:
z_matrix = z.generate_standardized_matrix('2023-03-29', '2024-03-28')
correlation_matrix = (1/(len(z_matrix) - 1)) * z_matrix.T @ z_matrix

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix , annot=False, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Standardized Returns')
plt.show()

Q16: (2 Marks)
Recompute the correlation matrix for the period from March 29, 2023 ('2023-03-29'), to March 28, 2024 ('2024-03-28') using the pandas corr method.


In [ ]:
corr = nasdaq_downloader.dataset.loc['2023-03-29':'2024-03-28'].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr , annot=False, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

comparison = (round(corr, 10) != round(correlation_matrix, 10))
print(f'The matrices have {comparison.sum().sum()} difference(s).')
The matrices have 0 difference(s).

Q17: (7 Marks)
Conduct Singular Value Decomposition on the correlation matrix $C$. Follow these steps:

  1. Perform SVD: Decompose the matrix $C$ into its singular values and vectors.
  2. Rank Eigenvalues: Sort the resulting singular values (often squared to compare to eigenvalues) in descending order.
  3. Select Components: Extract the first 20 components based on the largest singular values.
  4. Variance Explained: Print the variance explained by the first 20 components and the dimensions of the different matrices that you created.

In [ ]:
# SVD
U, sig, V_t = np.linalg.svd(corr)
# Rank Eigenvalues
eigenvals = sig**2
eigenvals_sort = np.sort(eigenvals)[::-1]
# 20 components
components = U[:, :20]
#  Variance
total_var = np.sum(eigenvals)
var_detail = np.sum(eigenvals_sort[:20]) / total_var

print(f"Variance as per first 20 components: {var_detail:.2%}")
print(f"U Dimensions: {U.shape}")
print(f"Sigma Dimensions: {sig.shape}")
print(f"V_t Dimensions: {V_t.shape}")
print(f"Dimensions as per first 20 components: {components.shape}")
Variance as per first 20 components: 97.24%
U Dimensions: (89, 89)
Sigma Dimensions: (89,)
V_t Dimensions: (89, 89)
Dimensions as per first 20 components: (89, 20)

Q18: (6 Marks)
Extract the 20 hidden factors into a matrix $F$. Check that the shape of $F$ is $(252, 20)$.


In [ ]:
filtered_data = nasdaq_downloader.dataset.loc['2023-03-29':'2024-03-28']
num_factors = 20
hidden_factors = np.zeros((len(filtered_data), num_factors))
std_devs = filtered_data.std()

for factor_idx in range(num_factors):
    for time_idx in range(len(filtered_data)):
        hidden_factors[time_idx, factor_idx] = np.sum(U[:, factor_idx] * filtered_data.iloc[time_idx,:] / std_devs)

hidden_factors_df = pd.DataFrame(hidden_factors, index=filtered_data.index, columns=[f'Factor {i+1}' for i in range(num_factors)])


print(hidden_factors_df)
             Factor 1  Factor 2  Factor 3  Factor 4  Factor 5  Factor 6  \
Date                                                                      
2023-03-29 -10.424283 -1.138075 -1.480293 -0.186457 -0.582038 -0.676525   
2023-03-30  -4.058376  0.780596 -0.866114  1.093297  0.709433 -0.062999   
2023-03-31  -7.957391 -2.279708  1.483896  0.125774 -1.825139 -0.439494   
2023-04-03   1.276744 -2.094034  0.054996 -1.486560 -0.073547  2.262700   
2023-04-04   3.668430 -0.203267  2.609416  1.318320 -1.365567 -0.814777   
...               ...       ...       ...       ...       ...       ...   
2024-03-22   2.598819  1.641039  1.609681  0.896374  0.659736  0.276673   
2024-03-25   3.259112  0.057719 -0.669773  3.287633  0.329844  0.511798   
2024-03-26   0.880618  0.137350  1.121877  0.425881 -0.375024 -0.874097   
2024-03-27  -5.489034 -4.973189 -3.135930 -0.571722 -0.387586 -0.527250   
2024-03-28  -0.529879 -1.287829 -0.791073 -1.063999 -0.396991 -1.048591   

            Factor 7  Factor 8  Factor 9  Factor 10  Factor 11  Factor 12  \
Date                                                                        
2023-03-29  0.086540 -0.850962 -0.679929   3.028042  -0.488648  -0.147226   
2023-03-30 -0.518398 -0.276002  0.476243   0.159625   1.132403  -0.744968   
2023-03-31  1.845769  0.039154 -1.710312  -0.248188  -0.103724  -1.411337   
2023-04-03  1.487817  0.501928 -0.083578  -0.560563   3.937579   0.417043   
2023-04-04 -0.774608  0.957229  0.555045   0.448973   0.804289   1.009629   
...              ...       ...       ...        ...        ...        ...   
2024-03-22 -1.351981 -2.273511  1.010149  -1.587501  -0.195123   1.161935   
2024-03-25 -0.591008  0.118838  0.313206  -0.209847   0.453261   1.278960   
2024-03-26  0.452714  0.702955 -0.154314   0.168684  -0.843783   0.379232   
2024-03-27 -1.458816 -0.851723  1.499895   1.472318   0.431281  -1.848008   
2024-03-28 -0.815666 -0.689423  0.040676   0.677404   0.401216   1.277569   

            Factor 13  Factor 14  Factor 15  Factor 16  Factor 17  Factor 18  \
Date                                                                           
2023-03-29  -0.528948   1.166845  -1.240441   1.639840   2.581771   1.239550   
2023-03-30   0.562968  -0.791916  -0.415928   0.369473  -0.818889  -0.362620   
2023-03-31  -0.678455  -1.897609   0.719745  -0.034329  -0.630826   0.057326   
2023-04-03   1.834757  -1.341610   0.003802  -1.434112   0.744093  -1.787384   
2023-04-04   0.908321  -0.703360   1.765790   1.899763  -0.367422  -1.442137   
...               ...        ...        ...        ...        ...        ...   
2024-03-22   1.120854   0.284247  -0.584442  -0.722520  -0.654369  -1.472712   
2024-03-25   0.641076  -0.009521   0.209846  -1.233158  -0.277053  -0.441647   
2024-03-26   0.375260   0.031077  -0.058553  -0.022528  -0.326512  -0.178796   
2024-03-27   0.403076   2.510488  -0.684977   0.714748   0.292610  -1.292726   
2024-03-28   0.564032   0.498776   0.223895  -0.037107   0.473084  -0.584489   

            Factor 19  Factor 20  
Date                              
2023-03-29   2.276762  -0.137516  
2023-03-30  -0.970115  -0.919965  
2023-03-31  -0.363120  -0.113381  
2023-04-03  -1.237222  -1.187009  
2023-04-04  -0.672108   0.678469  
...               ...        ...  
2024-03-22  -1.448307   0.450464  
2024-03-25   1.084513  -1.200247  
2024-03-26   0.092274   0.165596  
2024-03-27  -0.131812   0.744423  
2024-03-28   0.618640   0.693011  

[252 rows x 20 columns]

Q19: (3 Marks)
Perform the Regression Analysis of 'AAPL' for the date '2024-03-28', using a historical data length of 252 days using previous $F$ Matrix. Compare the R-squared from the ones obtained at Q11.


In [ ]:
aapl_returns = filtered_data['AAPL']

hidden_tmp = hidden_factors_df.loc['2023-03-29':'2024-03-28']
hidden_tmp = sm.add_constant(hidden_tmp)

model = sm.OLS(aapl_returns, hidden_tmp)
results = model.fit()


print("R-squared value:", results.rsquared)
display(results.summary())
R-squared value: 0.6154020479750228
OLS Regression Results
Dep. Variable: AAPL R-squared: 0.615
Model: OLS Adj. R-squared: 0.582
Method: Least Squares F-statistic: 18.48
Date: Wed, 05 Jun 2024 Prob (F-statistic): 2.29e-37
Time: 19:44:01 Log-Likelihood: 873.13
No. Observations: 252 AIC: -1704.
Df Residuals: 231 BIC: -1630.
Df Model: 20
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -0.0005 0.001 -0.874 0.383 -0.001 0.001
Factor 1 -0.0015 0.000 -14.408 0.000 -0.002 -0.001
Factor 2 0.0002 0.000 0.981 0.327 -0.000 0.001
Factor 3 0.0012 0.000 4.741 0.000 0.001 0.002
Factor 4 -0.0003 0.000 -0.954 0.341 -0.001 0.000
Factor 5 -0.0014 0.000 -4.351 0.000 -0.002 -0.001
Factor 6 -0.0006 0.000 -1.632 0.104 -0.001 0.000
Factor 7 -0.0010 0.000 -2.862 0.005 -0.002 -0.000
Factor 8 -0.0024 0.000 -6.338 0.000 -0.003 -0.002
Factor 9 2.647e-05 0.000 0.070 0.944 -0.001 0.001
Factor 10 -0.0017 0.000 -4.295 0.000 -0.002 -0.001
Factor 11 0.0015 0.000 3.786 0.000 0.001 0.002
Factor 12 -0.0012 0.000 -2.866 0.005 -0.002 -0.000
Factor 13 -3.566e-05 0.000 -0.085 0.933 -0.001 0.001
Factor 14 -0.0004 0.000 -0.860 0.390 -0.001 0.000
Factor 15 0.0020 0.000 4.411 0.000 0.001 0.003
Factor 16 0.0007 0.000 1.490 0.138 -0.000 0.002
Factor 17 0.0006 0.000 1.202 0.231 -0.000 0.001
Factor 18 0.0004 0.000 0.915 0.361 -0.000 0.001
Factor 19 -0.0005 0.000 -1.138 0.256 -0.001 0.000
Factor 20 -0.0004 0.000 -0.853 0.395 -0.001 0.001
Omnibus: 14.605 Durbin-Watson: 1.977
Prob(Omnibus): 0.001 Jarque-Bera (JB): 25.419
Skew: -0.323 Prob(JB): 3.02e-06
Kurtosis: 4.415 Cond. No. 5.07


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


Comparison:

The R-squared value of the PCA regression (0.615) is higher than that of the Fama-French (FF) regression (0.475). This indicates that the PCA model with 20 factors explains a larger proportion of the variance in the dependent variable (AAPL) than the FF model with 6 factors.

Similarly, the adjusted R-squared value, which adjusts for the number of predictors in the model, is higher for the PCA regression (0.582) compared to the FF regression (0.462). This further supports the conclusion that the PCA model explains the data better, even when taking into account the complexity of the model.

Key Takeaways:

The PCA model with 20 factors has a better fit to the data as indicated by the higher R-squared and adjusted R-squared values. This suggests that it captures more of the variability in AAPL's returns.

The PCA model is more complex with 20 predictors compared to the FF model which has 6 predictors. While the PCA model explains more variance, it is important to ensure that the inclusion of these 20 components is justified. Retaining too many components that add very little explanatory power and are statistically insignificant can lead to unnecessary complexity and potential overfitting.

In both models, some factors are statistically significant (p-value < 0.05) and contribute meaningfully to the model. The PCA model has several factors with very low p-values (indicating strong significance), while in the FF model, the Mkt-RF, HML, and RMW factors have a statistically significant effect on AAPL returns.

Ornstein Uhlenbeck¶

The Ornstein-Uhlenbeck process is defined by the following stochastic differential equation (SDE):

$$ dX_t = \theta (\mu - X_t) dt + \sigma dW_t $$

where:

  • $ X_t $: The value of the process at time $ t $.
  • $ \mu $: The long-term mean (equilibrium level) to which the process reverts.
  • $ \theta $: The speed of reversion or the rate at which the process returns to the mean.
  • $ \sigma $: The volatility (standard deviation), representing the magnitude of random fluctuations.
  • $ W_t $: A Wiener process or Brownian motion that adds stochastic (random) noise.

This equation describes a process where the variable $ X_t $ moves towards the mean $ \mu $ at a rate determined by $ \theta $, with random noise added by $ \sigma dW_t $.
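As a sanity check on this dynamic, a short Euler-Maruyama simulation of the SDE shows the pull toward $ \mu $; the parameter values below are chosen arbitrarily for illustration:

```python
import numpy as np

def simulate_ou(theta, mu, sigma, x0, dt, n_steps, seed=0):
    """Euler-Maruyama discretization of dX_t = theta*(mu - X_t)*dt + sigma*dW_t."""
    rng = np.random.default_rng(seed)
    x = np.empty(n_steps + 1)
    x[0] = x0
    for t in range(n_steps):
        dw = rng.normal(0.0, np.sqrt(dt))  # Wiener increment over one step
        x[t + 1] = x[t] + theta * (mu - x[t]) * dt + sigma * dw
    return x

# Start far from the mean; the path decays toward mu = 0 at rate theta
path = simulate_ou(theta=5.0, mu=0.0, sigma=0.1, x0=1.0, dt=1 / 252, n_steps=2520)
```

With $\theta = 5$ the transient $x_0 e^{-\theta t}$ dies out within roughly a year, after which the path fluctuates around $\mu$ with stationary standard deviation $\sigma / \sqrt{2\theta}$.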


Q20: (3 Marks)
In the context of mean reversion, which quantity should be modeled using an Ornstein-Uhlenbeck process?


In the context of mean reversion, the quantity that should be modeled using an Ornstein-Uhlenbeck process is the deviation of a time series from its long-term mean. The Ornstein-Uhlenbeck (OU) process is particularly suitable for modeling mean-reverting behavior because it describes a system in which the variable $ X_t $ tends to drift towards its mean $ \mu $ in the long run. In financial contexts, an OU process is often used to model: the short-term interest rate, which is commonly assumed to follow a mean-reverting process; the residuals from a regression model (such as the Fama-French model explored throughout this coursework) describing stock returns; and the volatility of financial instruments, which can also be mean-reverting.

For completeness, we can expand on the residuals. They represent dispersions from equilibrium and should therefore follow a mean-reverting process. The residuals are calculated as follows:

$$ \epsilon_{i,t} = Z_{i,t} - \hat{Z}_{i,t} $$

From this point, the residuals should be modeled using a continuous stochastic process with mean-reverting features, such as an OU process, since one of the assumptions of statistical arbitrage is that $\text{E} \left[\epsilon \right] = 0 $.


Q21: (5 Marks)
Explain how the parameters $ \theta $ and $ \sigma $ can be determined using the following equations. Also, detail the underlying assumptions: $$ E[X] = \mu $$ $$ \text{Var}[X] = \frac{\sigma^2}{2\theta} $$


The Ornstein-Uhlenbeck Process¶

$$ dX_t = \theta (\mu - X_t) dt + \sigma dW_t $$

where $ X_t $ represents the process value at time $ t $, $ \mu $ is the long-term mean, $ \theta $ is the speed of mean reversion, $ \sigma $ is the volatility, and $ W_t $ is a Wiener process (standard Brownian motion).

To determine $ \theta $ and $ \sigma $, we use the steady-state properties:

$$ E[X] = \mu $$ $$ \text{Var}[X] = \frac{\sigma^2}{2\theta} $$

First, estimate the long-term mean $ \mu $ as the sample mean of the observed time series data:

$$ \mu \approx \bar{X} = \frac{1}{N} \sum_{i=1}^{N} X_i $$

Next, calculate the variance of the observed data:

$$ \text{Var}[X] \approx \frac{1}{N-1} \sum_{i=1}^{N} (X_i - \bar{X})^2 $$

Using the variance relation, $ \theta $ and $ \sigma $ are linked by:

$$ \theta = \frac{\sigma^2}{2 \text{Var}[X]} $$

so that, given $ \theta $, we can solve for $ \sigma $:

$$ \sigma = \sqrt{2 \theta \text{Var}[X]} $$

Note that this single steady-state relation only ties $ \theta $ and $ \sigma $ together: one of the two must be pinned down from additional information, for example the speed of reversion implied by the lag-1 autocorrelation $ \phi = e^{-\theta \Delta t} $, after which the other follows from the variance equation.

The underlying assumptions are as follows:

  • The process is stationary (statistical properties are constant over time).
  • The process increments are normally distributed.
  • Time averages equal ensemble averages, i.e., the process is ergodic, which allows sample statistics to estimate population parameters.
  • Data is sampled continuously and without gaps.

By applying these steps, $ \theta $ and $ \sigma $ for the Ornstein-Uhlenbeck process can be estimated.
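A minimal sketch of the final step, assuming $ \theta $ has already been obtained from an auxiliary estimate (for example, the lag-1 autocorrelation), so that only the variance relation is applied:

```python
import numpy as np

def sigma_from_variance(x, theta):
    """Invert Var[X] = sigma^2 / (2*theta): sigma = sqrt(2 * theta * Var[X]).

    theta is assumed to be supplied externally; the variance relation alone
    cannot identify both parameters.
    """
    return float(np.sqrt(2.0 * theta * np.var(x, ddof=1)))
```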

Modeling the Ornstein-Uhlenbeck Process using Stochastic Differential Equations¶

The following part is relevant since the Ornstein-Uhlenbeck process can also be represented as an autoregressive model with one lag while maintaining its mean-reverting property, which allows its parameters to be calibrated via OLS or a closed-form solution (Yeo & Papanicolaou, 2017). The following is a full derivation of the steps undertaken by Avellaneda & Lee (2010).

The OU model is represented by the following continuous-time SDE: $$ dX_t = \theta (\mu - X_t) \, dt + \sigma dW_t $$

Consider that $f(t, X_t) = X_t e^{\theta t}$, hence: $$ df(t, X_t) = d(X_t e^{\theta t}) $$

Therefore, by Itô's Lemma, we obtain: $$ df(t, X_t) = \frac{df}{dt}dt + \frac{df}{dX_t}dX_t + \frac{1}{2} \frac{d^2f}{dX^2_t}(dX_t)^2 $$ $$ = \frac{df}{dt}dt + \frac{df}{dX_t}[\theta (\mu - X_t) \, dt + \sigma dW_t] + \frac{1}{2} \frac{d^2f}{dX^2_t}\sigma^2dt $$

$$ = \left[\frac{df}{dt} + \frac{df}{dX_t}\theta (\mu - X_t) \, + \frac{1}{2} \frac{d^2f}{dX^2_t}\sigma^2 \right]dt + \frac{df}{dX_t} \sigma dW_t $$

Since $f(t, X_t) = X_t e^{\theta t}$, we have:

$$ \frac{df}{dt} = \theta X_t e^{\theta t} $$

$$ \frac{df}{dX_t} = e^{\theta t} $$

$$ \frac{d^2f}{dX^2_t} = 0 $$

Therefore, $$ d(X_t e^{\theta t}) = \left[\theta X_t e^{\theta t} + e^{\theta t}\theta (\mu - X_t)\right]dt + e^{\theta t} \sigma dW_t $$ $$ d(X_t e^{\theta t}) = \left[\theta X_t e^{\theta t} + \theta \mu e^{\theta t} - \theta X_t e^{\theta t} \right]dt + e^{\theta t} \sigma dW_t $$

$$ d(X_t e^{\theta t}) = \theta \mu e^{\theta t} dt + e^{\theta t} \sigma dW_t $$

We integrate both sides from 0 to t, in order to solve the SDE:

$$ \int_{0}^{t} d(X_s e^{\theta s}) = \int_{0}^{t} \theta \mu e^{\theta s} ds + \int_{0}^{t} \sigma e^{\theta s} dW_s $$

$$ X_te^{\theta t} - X_0 = \theta \mu \left[\frac{e^{\theta s}}{\theta} \right]^{t}_{0} + \sigma \int_{0}^{t} e^{\theta s} dW_s $$

$$ X_te^{\theta t} - X_0 = \mu (e^{\theta t} - 1) + \sigma \int_{0}^{t} e^{\theta s} dW_s $$

We multiply both sides by $e^{-\theta t}$ and rearrange:

$$ X_t = X_0 e^{-\theta t} + \mu (1 - e^{-\theta t}) + \sigma e^{-\theta t} \int_{0}^{t} e^{\theta s} dW_s $$

Since we are dealing with a discrete time series of daily closing prices, we set $t = t_0 + \Delta t$ and take $t_0$ as the starting point of the integration. Rearranging yields the form presented by Avellaneda & Lee (2010):

$$ X(t_0 + \Delta t) = X(t_0) e^{-\theta \Delta t} + \mu (1 - e^{-\theta \Delta t}) + \sigma \int_{t_0}^{t_0 + \Delta t} e^{-\theta( t_0 + \Delta t - s)} dW_s $$

Since the OU model is a mean-reverting process, it can be represented as an autoregressive model with one lag, giving the following simpler and more intuitive representation:

$$ X_t = a + \phi X_{t-1} + \epsilon_t $$

Where,

$$ a = \mu (1 - e^{-\theta \Delta t}) $$

$$ \phi = e^{-\theta \Delta t} $$

$$ \epsilon_t = \sigma \int_{t_0}^{t_0 + \Delta t} e^{-\theta( t_0 + \Delta t - s)} dW_s $$
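The AR(1) mapping above suggests a direct calibration recipe: regress $X_t$ on $X_{t-1}$, then invert $a = \mu (1 - e^{-\theta \Delta t})$ and $\phi = e^{-\theta \Delta t}$. A minimal NumPy sketch on a simulated path (the parameter values and seed are arbitrary assumptions):

```python
import numpy as np

# Hypothetical OU parameters, used only to generate a synthetic path.
theta_true, mu_true, sigma_true, dt = 5.0, 1.0, 0.3, 1.0 / 252
n_steps = 100_000
rng = np.random.default_rng(0)

# Exact discretization: X_t = a + phi * X_{t-1} + eps_t, with
# phi = exp(-theta*dt), a = mu*(1 - phi), Var(eps) = sigma^2*(1 - phi^2)/(2*theta).
phi = np.exp(-theta_true * dt)
a = mu_true * (1 - phi)
eps_sd = sigma_true * np.sqrt((1 - phi**2) / (2 * theta_true))

x = np.empty(n_steps)
x[0] = mu_true
for t in range(1, n_steps):
    x[t] = a + phi * x[t - 1] + eps_sd * rng.standard_normal()

# OLS of X_t on X_{t-1} recovers (phi, a), hence (theta, mu).
phi_hat, a_hat = np.polyfit(x[:-1], x[1:], 1)
theta_hat = -np.log(phi_hat) / dt
mu_hat = a_hat / (1 - phi_hat)
print(f"theta_hat = {theta_hat:.2f}, mu_hat = {mu_hat:.2f}")
```

On a path this long, the OLS estimates land close to the true parameters, which is the closed-form calibration referred to in the text.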

Obtaining the Moments of the Ornstein-Uhlenbeck Process¶

Since the OU process can be represented as an AR(1) process, we can start the derivation of the moments from the following equation:

$$ X_t = a + \phi X_{t-1} + \epsilon_t $$

Taking expectations, we obtain:

$$ \text{E}\left[ X_t \right] = \text{E}\left[ a + \phi X_{t-1} + \epsilon_t\right] $$

Since $ \epsilon_t \sim \text{iid } \mathcal{N}(0, \sigma^2) $, we have that:

$$ \text{E}\left[ X_t \right] =a + \phi \, \text{E} \left[X_{t-1} \right] $$

Also, $a = \mu (1 - e^{-\theta \Delta t})$ and $\phi = e^{-\theta \Delta t}$, therefore:

$$ \text{E}\left[ X_t \right] = \mu (1 - e^{-\theta \Delta t}) + e^{-\theta \Delta t} \, \text{E} \left[X_{t-1} \right] $$

Following Avellaneda & Lee (2010), we take $\Delta t \to \infty$, yielding:

$$ \text{E}\left[ X_t \right] = \mu $$

since $e^{-\theta \Delta t} \to 0$ as $\Delta t \to \infty$.

Taking the variance of the process, we obtain:

$$ \text{Var}\left[ X_t \right] = \text{Var}\left[a + \phi X_{t-1} + \epsilon_t \right] $$

$$ \text{Var}\left[ X_t \right] = \phi^2 \, \text{Var}\left[X_{t-1} \right] + \text{Var}\left[\epsilon_{t} \right] $$

$$ \text{Var}\left[ X_t \right] = \phi^2 \, \sigma^2_{X_{t-1}} + \sigma^2 $$

$$ \sigma^2_{X_{t}} = \phi^2 \, \sigma^2_{X_{t-1}} + \sigma^2 $$

At equilibrium $\, \sigma^2_{X_{t}} = \sigma^2_{X_{t-1}} = \sigma^2_{X} \, $, thus by re-arranging:

$$ \sigma^2_{X} - \phi^2 \sigma^2_{X} = \sigma^2 $$

$$ \sigma^2_{X} \, (1 - \phi^2) = \sigma^2 $$

$$ \sigma^2_{X} = \frac{\sigma^2}{(1 - \phi^2)} $$

Remember that $\phi = e^{-\theta \Delta t}$; taking $\Delta t \to \infty$ gives $\phi \to 0$, thus:

$$ \sigma^2_{X} = \frac{\sigma^2}{1 - e^{-2 \theta \Delta t}} = \frac{\sigma^2}{1 - 0} = \sigma^2 $$

Therefore,

$$ \text{Var}\left[ X \right] = \sigma^2 = \text{Var}\left[\epsilon_{t} \right] $$

$$ \text{Var}\left[ X \right] = \text{Var}\left[ \sigma \int_{t_0}^{t_0 + \Delta t} e^{-\theta( t_0 + \Delta t - s)} dW_s \right] $$

$$ \text{Var}\left[ X \right] = \sigma^2 e^{- 2\theta( t_0 + \Delta t)} \, \text{Var}\left[ \int_{t_0}^{t_0 + \Delta t} e^{\theta s} dW_s \right] $$

By the Itô isometry:

$$ \text{Var}\left[ X \right] = \sigma^2 e^{- 2\theta( t_0 + \Delta t)} \int_{t_0}^{t_0 + \Delta t} e^{2\theta s} ds $$

$$ \text{Var}\left[ X \right] = \sigma^2 e^{- 2\theta( t_0 + \Delta t)} \left[ \frac{e^{2\theta s}}{2\theta} \right]^{t_0 + \Delta t}_{t_0} $$

$$ \text{Var}\left[ X \right] = \sigma^2 e^{- 2\theta \Delta t} \, \frac{e^{2 \theta \Delta t} - 1}{2 \theta} $$

$$ \text{Var}\left[ X \right] = \frac{\sigma^2}{2 \theta} (1 - e^{-2 \theta \Delta t}) $$

We now take once again $\Delta t \to \infty$, yielding the final solution:

$$ \text{Var}\left[ X \right] = \frac{\sigma^2}{2 \theta} $$
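The stationary variance can be sanity-checked numerically: simulate a long path with the exact discretization and compare its sample variance to $\sigma^2 / (2\theta)$ (the parameters below are arbitrary assumptions):

```python
import numpy as np

theta, mu, sigma, dt = 2.0, 0.0, 0.5, 0.01  # hypothetical parameters
n = 500_000
rng = np.random.default_rng(1)

phi = np.exp(-theta * dt)
eps_sd = sigma * np.sqrt((1 - phi**2) / (2 * theta))

x = np.empty(n)
x[0] = mu  # start at the long-run mean, so the path is stationary throughout
for t in range(1, n):
    x[t] = mu * (1 - phi) + phi * x[t - 1] + eps_sd * rng.standard_normal()

empirical_var = x.var()
theoretical_var = sigma**2 / (2 * theta)  # = 0.0625
print(f"empirical {empirical_var:.4f} vs theoretical {theoretical_var:.4f}")
```

The two values agree to within sampling error, consistent with the final result above.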


Q22: (2 Marks)
Create a function named extract_s_scores that computes the 's score' for the last element in a list of floating-point numbers, using the formula $ \text{s score} = \frac{X_T - \mu}{\sigma} $, where list_xi is a list containing the sequence $(X_0, \cdots, X_T)$.


In [ ]:
def extract_s_scores(list_xi):
    if len(list_xi) < 2:
        raise ValueError("At least two observations are required.")

    # Sample mean and (Bessel-corrected) sample standard deviation.
    mu = sum(list_xi) / len(list_xi)
    sigma = (sum((xi - mu) ** 2 for xi in list_xi) / (len(list_xi) - 1)) ** 0.5

    # s score of the last observation X_T.
    s_scores = (list_xi[-1] - mu) / sigma
    return s_scores

s_scores = extract_s_scores([1.0, 2.0, 3.0, 4.0, 5.0])
print(f"s scores: {s_scores:.2f}")
s scores: 1.26

Autoencoder Analysis¶

Autoencoders are neural networks used for unsupervised learning, particularly for dimensionality reduction and feature extraction. Training an autoencoder on the $Z_i$ matrix aims to identify hidden factors capturing the intrinsic structures in financial data.

Architecture¶

  • Encoder: Compresses input data into a smaller latent space representation.
    • Input Layer: Matches the number of features in the $Z_i$ matrix.
    • Hidden Layers: Compress data through progressively smaller layers.
    • Latent Space: Encodes the data into hidden factors.
  • Decoder: Reconstructs input data from the latent space.
    • Hidden Layers: Gradually expand to the original dimension.
    • Output Layer: Matches the input layer to recreate the original matrix.

Training¶

The autoencoder is trained by minimizing reconstruction loss, usually mean squared error (MSE), between the input $Z_i$ matrix and the decoder's output.

Hidden Factors Extraction¶

After training, the encoder's latent space provides the most important underlying patterns in the stock returns.
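In Keras, the hidden factors can be read off by wrapping the trained encoder layer in its own sub-model. A small self-contained sketch (the stand-in model, layer names, and sizes are assumptions mirroring the architecture used later in this notebook):

```python
import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense

# Stand-in autoencoder; in practice this would be the trained model.
inp = Input(shape=(252,))
latent = Dense(20, activation="relu", name="encoder")(inp)
out = Dense(252, activation="tanh", name="decoder")(latent)
demo_autoencoder = Model(inp, out)

# Sub-model from the input to the encoder output: its predictions
# are the 20 hidden factors for each sample.
demo_encoder = Model(demo_autoencoder.input,
                     demo_autoencoder.get_layer("encoder").output)
factors = demo_encoder.predict(np.random.randn(5, 252), verbose=0)
print(factors.shape)  # (5, 20)
```

The same pattern applies to any trained autoencoder: the encoder sub-model maps each stock's return series to its latent factor loadings.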


Q23: (2 Marks)
Modify the standardized returns matrix Z_matrix to reduce the influence of extreme outliers on model training by ensuring that all values in the matrix Z_matrix do not exceed 3 standard deviations from the mean. Specifically, cap these values to the interval $[-3, 3]$. Store the adjusted values in a new matrix, Z_hat.


In [ ]:
Z_hat = z_matrix.clip(lower=-3, upper=3)

display(Z_hat)
print("Total number of values greater than 3: ", (z_matrix > 3).sum().sum())
print("Total number of values less than -3: ", (z_matrix < -3).sum().sum(), f'\n')
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2023-03-29 0.658925 2.189521 0.106285 0.207821 1.503570 0.444433 1.020467 0.727398 1.860839 0.356414 ... 0.517876 0.615242 0.814986 1.344650 1.105776 0.127319 0.497115 0.429736 2.277783 1.269711
2023-03-30 0.273051 -0.221306 -0.389329 -0.434766 0.787034 0.526996 0.277291 0.076716 1.630406 0.936497 ... -0.109884 0.439972 0.233493 1.187056 0.239468 -0.525980 0.691243 0.446100 0.389915 0.430715
2023-03-31 0.360570 1.136308 1.540732 1.439604 0.531735 -0.056439 0.475365 0.008639 0.935425 1.044069 ... 1.337544 0.117646 2.058867 0.640237 0.325258 0.538943 -0.008816 0.565842 1.611597 0.597545
2023-04-03 -0.713053 -2.259591 0.252735 0.407399 -0.591892 -0.600163 -0.086824 0.759731 -0.328378 -0.555499 ... -0.377687 1.191774 -2.030038 -0.684867 -0.214460 0.183344 1.205798 -0.547975 -0.634285 0.121519
2023-04-04 0.560730 -1.137430 0.099652 0.013877 0.658632 -0.342217 0.223111 0.872403 -0.399698 -0.127907 ... 1.434972 -0.347689 -0.377655 -1.394525 -0.538698 -0.487155 0.553153 0.755053 -0.537767 1.011197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-03-22 -1.146803 -0.516681 1.151431 1.085070 0.074915 0.081847 -0.150701 -0.278013 -0.555224 0.126867 ... 0.046870 -0.246035 -0.386620 -0.054031 -0.476030 -0.091837 -0.421327 -0.946797 0.106149 -0.002851
2024-03-25 0.659349 -1.221008 -0.372489 -0.341074 0.109668 -0.292706 -0.084892 1.184710 -0.959285 -0.282029 ... -2.575803 0.240999 0.343241 -0.651297 -1.151639 -0.024341 0.166127 0.118796 -0.427841 0.321648
2024-03-26 -0.032708 0.234343 0.131651 0.109342 -0.556130 -0.244717 -0.377797 0.183364 -0.577464 0.317533 ... 0.150958 -0.070195 0.960796 -1.177043 -0.384593 0.306386 -0.206326 -0.246683 0.237594 -0.878121
2024-03-27 -0.363721 1.052056 -0.024166 -0.010591 0.315892 0.224881 2.192442 1.128111 1.411127 -0.309662 ... 0.034693 0.474269 0.396878 1.991081 0.941800 -0.265792 1.179504 1.004419 -0.793726 2.193537
2024-03-28 -0.048375 0.411934 -0.078409 0.019962 0.022729 0.067776 1.190631 -0.583106 1.407557 -0.141870 ... 0.577964 0.645940 -0.749067 0.514492 0.585964 0.026648 1.496052 0.367481 -0.251144 0.527595

252 rows × 89 columns

Total number of values greater than 3:  146
Total number of values less than -3:  139 


Q24: (1 Mark)
Fetch the Z_hat data from GitHub; we will proceed with this version from here on.

In [ ]:
Z_hat_git = DatasetDownloader("https://raw.githubusercontent.com/Jandsy/ml_finance_imperial/main/Coursework/Z_hat.csv", index_col = 'Date', parse_dates=True)

Z_hat_git.display_dataset()

comparison = round(Z_hat, 10) != round(Z_hat_git.dataset, 10)

print(f'\n', "Total number of differences between the Z_hat matrices: ", f'\n', comparison.sum().sum())
Z_hat = Z_hat_git
ADBE ADP GOOGL GOOG AMZN AMD AEP AMGN ADI ANSS ... TTWO TMUS TSLA TXN VRSK VRTX WBA WBD WDAY XEL
Date
2023-03-29 0.658925 2.189521 0.106285 0.207821 1.503570 0.444433 1.020467 0.727398 1.860839 0.356414 ... 0.517876 0.615242 0.814986 1.344650 1.105776 0.127319 0.497115 0.429736 2.277783 1.269711
2023-03-30 0.273051 -0.221306 -0.389329 -0.434766 0.787034 0.526996 0.277291 0.076716 1.630406 0.936497 ... -0.109884 0.439972 0.233493 1.187056 0.239468 -0.525980 0.691243 0.446100 0.389915 0.430715
2023-03-31 0.360570 1.136308 1.540732 1.439604 0.531735 -0.056439 0.475365 0.008639 0.935425 1.044069 ... 1.337544 0.117646 2.058867 0.640237 0.325258 0.538943 -0.008816 0.565842 1.611597 0.597545
2023-04-03 -0.713053 -2.259591 0.252735 0.407399 -0.591892 -0.600163 -0.086824 0.759731 -0.328378 -0.555499 ... -0.377687 1.191774 -2.030038 -0.684867 -0.214460 0.183344 1.205798 -0.547975 -0.634285 0.121519
2023-04-04 0.560730 -1.137430 0.099652 0.013877 0.658632 -0.342217 0.223111 0.872403 -0.399698 -0.127907 ... 1.434972 -0.347689 -0.377655 -1.394525 -0.538698 -0.487155 0.553153 0.755053 -0.537767 1.011197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-03-22 -1.146803 -0.516681 1.151431 1.085070 0.074915 0.081847 -0.150701 -0.278013 -0.555224 0.126867 ... 0.046870 -0.246035 -0.386620 -0.054031 -0.476030 -0.091837 -0.421327 -0.946797 0.106149 -0.002851
2024-03-25 0.659349 -1.221008 -0.372489 -0.341074 0.109668 -0.292706 -0.084892 1.184710 -0.959285 -0.282029 ... -2.575803 0.240999 0.343241 -0.651297 -1.151639 -0.024341 0.166127 0.118796 -0.427841 0.321648
2024-03-26 -0.032708 0.234343 0.131651 0.109342 -0.556130 -0.244717 -0.377797 0.183364 -0.577464 0.317533 ... 0.150958 -0.070195 0.960796 -1.177043 -0.384593 0.306386 -0.206326 -0.246683 0.237594 -0.878121
2024-03-27 -0.363721 1.052056 -0.024166 -0.010591 0.315892 0.224881 2.192442 1.128111 1.411127 -0.309662 ... 0.034693 0.474269 0.396878 1.991081 0.941800 -0.265792 1.179504 1.004419 -0.793726 2.193537
2024-03-28 -0.048375 0.411934 -0.078409 0.019962 0.022729 0.067776 1.190631 -0.583106 1.407557 -0.141870 ... 0.577964 0.645940 -0.749067 0.514492 0.585964 0.026648 1.496052 0.367481 -0.251144 0.527595

252 rows × 89 columns

 Total number of differences between the Z_hat matrices:  
 0

Q25: (3 Marks)
Segment the standardized and capped returns matrix $\hat{Z}$ into two subsets for model training and testing. Precisely, allocate 70% of the data in $\hat{Z}$ to the training set $ \hat{Z}_{train} $ and the remaining 30% to the testing set $\hat{Z}_{test}$. Treat each stock within $\hat{Z}$ as an individual sample, flattening temporal dependencies.


In [ ]:
Z_hat_flattened = Z_hat.dataset.T
Z_hat_flattened = Z_hat_flattened.iloc[::-1]

Z_train, Z_test = train_test_split(Z_hat_flattened, test_size=0.3, random_state = 42)


print(f'Training Data Shape: {Z_train.shape}')
print(f'Test Data Shape: {Z_test.shape}')

display(Z_train)
Training Data Shape: (62, 252)
Test Data Shape: (27, 252)
Date 2023-03-29 2023-03-30 2023-03-31 2023-04-03 2023-04-04 2023-04-05 2023-04-06 2023-04-10 2023-04-11 2023-04-12 ... 2024-03-15 2024-03-18 2024-03-19 2024-03-20 2024-03-21 2024-03-22 2024-03-25 2024-03-26 2024-03-27 2024-03-28
TTWO 0.517876 -0.109884 1.337544 -0.377687 1.434972 -0.396598 -1.058358 0.968511 -0.393230 -0.425128 ... -0.009507 0.230010 0.174177 1.277133 1.455687 0.046870 -2.575803 0.150958 0.034693 0.577964
CDW 0.828280 0.554179 1.662702 -0.526449 -1.683846 -0.575859 -1.011494 1.226798 -0.297176 -0.249831 ... -0.372115 -0.237388 0.746046 0.685481 1.011843 0.135679 -0.593217 -0.024880 0.926266 -0.661728
VRSK 1.105776 0.239468 0.325258 -0.214460 -0.538698 0.312297 -0.561815 -0.160325 0.460051 0.241987 ... 0.540265 0.364289 0.547444 -0.469841 -0.268086 -0.476030 -1.151639 -0.384593 0.941800 0.585964
AZN 0.032577 0.581046 0.210087 0.511913 0.342474 1.845515 0.281773 -0.279871 0.100865 1.575226 ... -0.798037 -0.511905 0.001356 -0.152553 0.497053 0.066928 -0.446724 0.507049 2.097049 -0.550594
KHC 0.660937 0.395396 -0.408666 0.419092 -0.573693 0.965708 0.274338 -0.101190 0.344085 0.062529 ... 0.126003 1.459084 0.911082 0.200358 0.640640 0.636088 0.836122 0.473778 0.395669 0.920775
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PANW 0.181513 0.170956 0.996037 -0.639367 -0.029813 -0.473513 -0.456103 0.051163 -0.035997 0.359068 ... -0.698661 0.295760 -0.614807 0.106095 0.592057 -0.167693 -0.268180 0.120918 -0.585240 0.152401
CPRT 0.819742 -0.718921 1.640747 0.127135 0.045753 -1.137620 -0.449916 0.456523 0.633908 0.255425 ... -0.535086 -0.199247 0.098243 0.704098 0.268083 0.119997 -0.449088 -0.092123 -0.118674 0.755961
BIIB 1.022446 0.339255 0.787943 -0.117509 -0.468927 2.262874 0.743043 -0.900157 0.259082 0.569755 ... -0.043094 0.083584 -0.188971 -0.097608 0.883756 -0.779638 -0.394465 -1.443454 1.471780 -0.173139
ROP 0.456743 0.531664 0.777325 -0.092862 -1.025041 0.143535 -0.528477 0.014353 0.482718 0.793161 ... -0.064633 -0.347129 0.990282 0.303363 0.307437 -0.097867 -1.152330 0.553945 0.790399 0.197501
FAST 0.383186 -0.367201 1.573852 -0.303210 -1.368706 -2.423653 -0.185549 0.943881 0.619980 -0.291865 ... -0.396009 0.582612 0.405553 1.397994 0.651785 -0.468057 -1.265598 -0.379706 0.387375 -0.284817

62 rows × 252 columns


Q26: (10 Marks)
Please create an autoencoder following the instructions provided in End-to-End Policy Learning of a Statistical Arbitrage Autoencoder Architecture, using model 'Variant 2' from Table 1.


In [ ]:
# Autoencoder Variant 2
input_dim = Z_train.shape[1]
hidden_layer = 20

input_layer = Input(shape=(input_dim,), name="input")
encoder = Dense(hidden_layer, activation='relu', use_bias=True, name="encoder")(input_layer)
decoder = Dense(input_dim, activation='tanh', use_bias=True, name="decoder")(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

autoencoder.summary()
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input (InputLayer)          [(None, 252)]             0         
                                                                 
 encoder (Dense)             (None, 20)                5060      
                                                                 
 decoder (Dense)             (None, 252)               5292      
                                                                 
=================================================================
Total params: 10,352
Trainable params: 10,352
Non-trainable params: 0
_________________________________________________________________

Q27: (1 Mark)

Display all the parameters of the deep neural network.


In [ ]:
autoencoder.summary()
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input (InputLayer)          [(None, 252)]             0         
                                                                 
 encoder (Dense)             (None, 20)                5060      
                                                                 
 decoder (Dense)             (None, 252)               5292      
                                                                 
=================================================================
Total params: 10,352
Trainable params: 10,352
Non-trainable params: 0
_________________________________________________________________

Q28: (3 Marks)
Train your model using the Adam optimizer for 20 epochs with a batch size equal to 8 and validation split to 20%. Specify the loss function you've chosen.

In [ ]:
# Train the autoencoder to reconstruct its own input (MSE loss);
# fit() returns a History object with the per-epoch losses.
history = autoencoder.fit(Z_train, Z_train,
                          epochs=20,
                          batch_size=8,
                          validation_split=0.2,
                          shuffle=True)

# Plotting training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()
Epoch 1/20
7/7 [==============================] - 1s 24ms/step - loss: 0.9303 - val_loss: 0.8712
Epoch 2/20
7/7 [==============================] - 0s 5ms/step - loss: 0.8660 - val_loss: 0.8482
Epoch 3/20
7/7 [==============================] - 0s 5ms/step - loss: 0.8332 - val_loss: 0.8329
Epoch 4/20
7/7 [==============================] - 0s 6ms/step - loss: 0.8088 - val_loss: 0.8175
Epoch 5/20
7/7 [==============================] - 0s 7ms/step - loss: 0.7837 - val_loss: 0.7999
Epoch 6/20
7/7 [==============================] - 0s 6ms/step - loss: 0.7557 - val_loss: 0.7780
Epoch 7/20
7/7 [==============================] - 0s 6ms/step - loss: 0.7258 - val_loss: 0.7518
Epoch 8/20
7/7 [==============================] - 0s 7ms/step - loss: 0.6926 - val_loss: 0.7279
Epoch 9/20
7/7 [==============================] - 0s 6ms/step - loss: 0.6641 - val_loss: 0.7091
Epoch 10/20
7/7 [==============================] - 0s 6ms/step - loss: 0.6406 - val_loss: 0.6946
Epoch 11/20
7/7 [==============================] - 0s 5ms/step - loss: 0.6213 - val_loss: 0.6834
Epoch 12/20
7/7 [==============================] - 0s 5ms/step - loss: 0.6053 - val_loss: 0.6753
Epoch 13/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5923 - val_loss: 0.6689
Epoch 14/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5805 - val_loss: 0.6631
Epoch 15/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5705 - val_loss: 0.6573
Epoch 16/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5610 - val_loss: 0.6519
Epoch 17/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5518 - val_loss: 0.6475
Epoch 18/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5440 - val_loss: 0.6432
Epoch 19/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5352 - val_loss: 0.6396
Epoch 20/20
7/7 [==============================] - 0s 5ms/step - loss: 0.5270 - val_loss: 0.6365
[Figure: training and validation loss over epochs]

Q29: (3 Marks)
Predict using the testing set and extract the residuals for the 'NVDA' stock, based on the methodology described in End-to-End Policy Learning of a Statistical Arbitrage Autoencoder Architecture.


In [ ]:
Z_predict = autoencoder.predict(Z_test)

Z_test_predict = pd.DataFrame(Z_predict, columns = Z_test.columns, index = Z_test.index).T

Z_hat_test_T = Z_test.T

residuals_NVDA = Z_hat_test_T["NVDA"] - Z_test_predict["NVDA"]


plot_data = pd.DataFrame({"NVDA":Z_hat_test_T["NVDA"] , "NVDA predicted":Z_test_predict["NVDA"], "Residual":residuals_NVDA})

display(plot_data)

for series in plot_data:
  plot_data[series].plot(figsize=(14, 3))
  plt.title(f'Time Series Plot of {series}')
  plt.xlabel('Date')
  plt.ylabel('Values')
  plt.show()
1/1 [==============================] - 0s 22ms/step
NVDA NVDA predicted Residual
Date
2023-03-29 0.552528 0.989178 -0.436650
2023-03-30 0.318767 0.681753 -0.362986
2023-03-31 0.305373 0.943455 -0.638082
2023-04-03 0.048987 0.075185 -0.026198
2023-04-04 -0.794750 -0.560682 -0.234068
... ... ... ...
2024-03-22 0.871467 0.091508 0.779960
2024-03-25 0.075687 -0.526495 0.602183
2024-03-26 -1.043244 -0.587622 -0.455622
2024-03-27 -1.018786 0.460278 -1.479064
2024-03-28 -0.139221 -0.470273 0.331052

252 rows × 3 columns

[Figures: time series plots of NVDA, the NVDA reconstruction, and the residual]

Q30: (7 Marks)
By reading carefully the paper End-to-End Policy Learning of a Statistical Arbitrage Autoencoder Architecture, answer the following questions:

  1. Summarize the Key Actions: Highlight the main experiments and methodologies employed by the authors in Section 5.
  2. Reproduction Steps: Detail the necessary steps required to replicate the authors' approach based on the descriptions provided in the paper.
  3. Proposed Improvement: Suggest one potential enhancement to the methodology that could potentially increase the effectiveness or efficiency of the model.

Summarize the Key Actions: Highlight the main experiments and methodologies employed by the authors in Section 5.¶

In Section 5 of the paper "End-to-End Policy Learning of a Statistical Arbitrage Autoencoder Architecture," the authors focus on the development and evaluation of an integrated autoencoder-based statistical arbitrage (StatArb) trading strategy. The paper designs an autoencoder that extracts residuals from standardized returns. The encoder consists of one layer with 20 hidden units and uses a rectified linear unit (ReLU) activation function, while the decoder also consists of one layer and uses a hyperbolic tangent (tanh) activation function to reconstruct the input data.

The model is trained on standardized returns over a rolling window of 252 trading days, with returns capped at 3 standard deviations to prevent outliers from distorting the training. The loss function used is Mean Squared Error (MSE), optimized using the Adam optimizer.

The autoencoder is integrated into a neural network that represents a trading policy. The trading policy layer calculates the difference between the decoded returns and the original input returns to generate residuals. The final layer constructs a portfolio of weights using a hyperbolic tangent activation function with no bias term.

The objective is to minimize a combined loss function that includes the MSE and the Sharpe ratio of the portfolio returns. The Sharpe ratio is annualized and used as part of the loss function to optimize for risk-adjusted returns.

The performance of the autoencoder-based trading strategy is evaluated against several benchmark models, including PCA and Fama French factor models. The authors compare the pre-cost performance of different variants of the autoencoder policy strategy, focusing on the Sharpe ratio and mean returns.

Reproduction Steps: Detail the necessary steps required to replicate the authors' approach based on the descriptions provided in the paper.¶

First, the data should be prepared: collect historical stock return data from a reliable source, covering a sufficient time period. Prices should then be adjusted for dividends and stock splits to obtain total return series. Finally, returns are standardised and capped at 3 standard deviations to mitigate the influence of outliers.

Second, the autoencoder needs to be implemented with the specified architecture: one hidden layer in the encoder with 20 units and ReLU activation, and one hidden layer in the decoder with tanh activation. The autoencoder is then trained for 20 epochs with a suitable batch size on the standardized returns, using a rolling window of 252 trading days.

Next, the autoencoder needs to be integrated into a neural network that constructs a trading policy: add a layer that calculates residuals by subtracting the decoded returns from the original input returns, followed by a layer that constructs the portfolio weights using a hyperbolic tangent activation function.
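This integration step can be sketched in Keras; the residual sign (input minus reconstruction), the single tanh output unit per stock sample, and the layer sizes are assumptions based on the description above, not the paper's exact code:

```python
import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Subtract

n_features = 252  # one year of daily standardized returns per stock

# Autoencoder backbone (ReLU encoder, tanh decoder, as in Variant 2).
inp = Input(shape=(n_features,), name="returns")
encoded = Dense(20, activation="relu", name="encoder")(inp)
decoded = Dense(n_features, activation="tanh", name="decoder")(encoded)

# Policy head: residuals feed a bias-free tanh layer that outputs
# one allocation per stock sample.
residual = Subtract(name="residual")([inp, decoded])
allocation = Dense(1, activation="tanh", use_bias=False, name="policy")(residual)

policy_model = Model(inputs=inp, outputs=[allocation, decoded])
outputs = policy_model.predict(np.zeros((3, n_features)), verbose=0)
print(outputs[0].shape, outputs[1].shape)
```

Exposing both the allocation and the reconstruction as outputs lets a combined loss penalize reconstruction error and reward portfolio performance at the same time.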

Moreover, to optimise the network, a combined loss function that includes both the MSE and the Sharpe ratio of the portfolio returns must be defined. Specifically, the combined loss function is:

$$ \mathcal{L}(\theta) = \lambda \text{MSE}(Z_t, \hat{Z}_t) + (1 - \lambda) \text{Sharpe}(w_t, R_{t+1}) $$

Subject to:

$$ \sum_{i=1}^{n} w_{i,t} = 1 \quad \forall t $$

$$ \theta = \{W^{(0)}, W^{(1)}, W^{(2)}, b^{(0)}, b^{(1)}\} $$

where the Sharpe ratio policy is defined as:

$$ \text{Sharpe} = \sqrt{252} \frac{\mu_{p,t}}{\sigma_{p,t}} $$

$$ \mu_{p,t} = \frac{1}{T} \sum_{\tau=1}^{T} r_{p,t-\tau} $$

$$ \sigma_{p,t} = \sqrt{\frac{1}{T} \sum_{\tau=1}^{T} (r_{p,t-\tau} - \mu_{p,t})^2} $$

The portfolio return is defined as:

$$ r_{p,t} = \sum_{i=1}^n w_{i,t} r_{i,t+1} $$

The network must then be trained to minimize this combined loss function.
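A plain-NumPy sketch of this combined objective follows; the choice $\lambda = 0.5$, the negation of the Sharpe term (so that minimizing the loss maximizes risk-adjusted return), and the $10^{-8}$ stabilizer are assumptions:

```python
import numpy as np

def combined_loss(z_true, z_recon, portfolio_returns, lam=0.5):
    """lam * MSE(Z, Z_hat) - (1 - lam) * annualized Sharpe of the portfolio.

    The Sharpe term is negated so that minimizing the loss maximizes
    risk-adjusted returns; lam balances the two objectives.
    """
    mse = np.mean((z_true - z_recon) ** 2)
    mu_p = portfolio_returns.mean()
    sigma_p = portfolio_returns.std()
    sharpe = np.sqrt(252) * mu_p / (sigma_p + 1e-8)  # annualized Sharpe
    return lam * mse - (1 - lam) * sharpe

# Perfect reconstruction and zero-mean returns give a loss of zero.
loss = combined_loss(np.zeros(4), np.zeros(4), np.array([0.01, -0.01]))
print(loss)  # 0.0
```

In the actual network this quantity would be implemented as a differentiable Keras loss so that gradients flow through both the reconstruction and the policy head.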

Finally, overall performance must be evaluated by comparing the Sharpe ratio and mean returns of the autoencoder-based strategy against benchmark models such as Fama French. It would also be beneficial to conduct out-of-sample testing to validate the strategy's effectiveness.

Proposed Improvement: Suggest one potential enhancement to the methodology that could potentially increase the effectiveness or efficiency of the model.¶

One potential enhancement to the methodology is to incorporate a Variational Autoencoder (VAE) instead of a standard autoencoder. A VAE can capture the underlying distribution of the data more effectively and generate more robust latent representations. This could potentially improve the model's ability to identify and exploit mean-reverting relationships in the data.

Specific Steps for Incorporating a VAE: To begin with, the autoencoder can be redesigned as a Variational Autoencoder with largely the same architecture, apart from the addition of layers for the mean and log-variance of the latent variables. The loss function should also be adjusted to include a KL-divergence term alongside the reconstruction error. The training approach can remain the same. During evaluation, the Variational Autoencoder can be compared against the original autoencoder.