From d05dca4f28380a67c25806a6ce79352cae041579 Mon Sep 17 00:00:00 2001
From: Ludovico Lemma <89784228+ludovicolemma@users.noreply.github.com>
Date: Fri, 12 Aug 2022 15:48:48 +0200
Subject: [PATCH] Add files via upload
---
notebooks/01_DevNotebook_LSTM_study.ipynb | 1161 ++++++++++
...book_Debugging_classes_and_functions.ipynb | 1986 +++++++++++++++++
2 files changed, 3147 insertions(+)
create mode 100644 notebooks/01_DevNotebook_LSTM_study.ipynb
create mode 100644 notebooks/02_DevNotebook_Debugging_classes_and_functions.ipynb
diff --git a/notebooks/01_DevNotebook_LSTM_study.ipynb b/notebooks/01_DevNotebook_LSTM_study.ipynb
new file mode 100644
index 0000000..dade8ac
--- /dev/null
+++ b/notebooks/01_DevNotebook_LSTM_study.ipynb
@@ -0,0 +1,1161 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "01 - LSTM_study_notebook.ipynb",
+ "provenance": [],
+ "collapsed_sections": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VXs2IK6k92Gh"
+ },
+ "outputs": [],
+ "source": [
+ "import datetime\n",
+ "import time\n",
+ "\n",
+ "import urllib.request\n",
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from sklearn.preprocessing import MinMaxScaler\n",
+ "\n",
+ "from tensorflow.keras.models import load_model\n",
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.layers import Dense\n",
+ "from tensorflow.keras.layers import LSTM\n",
+ "from tensorflow.keras.layers import Dropout\n",
+ "\n",
+ "def yahoo_finance_csv(code, start_from_date = '2010-07-01', end_to_date = datetime.date.today().isoformat(), interval = 'd'): #other intervals are 'wk' and 'mo'\n",
+ "\n",
+ " #setting a User-Agent header to avoid possible errors\n",
+ " opener = urllib.request.build_opener()\n",
+ " opener.addheaders = [('User-Agent','Mozilla/5.0')]\n",
+ " urllib.request.install_opener(opener)\n",
+ " \n",
+ " #converting dates from iso format\n",
+ " start_from_date = datetime.datetime.fromisoformat(start_from_date)\n",
+ " end_to_date = datetime.datetime.fromisoformat(end_to_date)\n",
+ " \n",
+ " #converting dates to unix time\n",
+ " start_from_code = int(time.mktime(start_from_date.timetuple())) #1277942400 is the code of the 1st of july of 2010 / CHOOSE A CLOSER DATE IF THE CHANGE IN PRICE LEVEL IS SUBSTANTIAL\n",
+ " todays_code = int(time.mktime(end_to_date.timetuple()))\n",
+ "\n",
+ " url = f'https://query1.finance.yahoo.com/v7/finance/download/{code}?period1={start_from_code}&period2={todays_code}&interval=1{interval}&events=history&includeAdjustedClose=true'\n",
+ "\n",
+ " #Save as a temporary file; urlretrieve returns a (local file path, HTTP headers) tuple\n",
+ " save_to_path = urllib.request.urlretrieve(url)\n",
+ " \n",
+ " pos_saved_csv = save_to_path[0]\n",
+ "\n",
+ " return pos_saved_csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# DOWNLOADING AND PREPARING THE DATA"
+ ],
+ "metadata": {
+ "id": "YY4LXU2T05KR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#DOWNLOAD the prices with {code}\n",
+ "code = 'BTC-EUR'\n",
+ "df = pd.read_csv(yahoo_finance_csv(code))"
+ ],
+ "metadata": {
+ "id": "j1E6bt36-ZXo"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#dropping null rows (sometimes yahoo finance has them)\n",
+ "df = df.dropna().reset_index(drop=True)"
+ ],
+ "metadata": {
+ "id": "ZgobUz3mnq0K"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 424
+ },
+ "id": "U0RbsiZJ3Jn_",
+ "outputId": "43363d13-193c-4742-a77a-54e771b3e326"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Date Open High Low Close \\\n",
+ "0 2014-09-17 359.546204 361.468506 351.586884 355.957367 \n",
+ "1 2014-09-18 355.588409 355.505402 319.789459 328.539368 \n",
+ "2 2014-09-19 328.278503 330.936707 298.921021 307.761139 \n",
+ "3 2014-09-20 307.665253 329.978180 303.931244 318.758972 \n",
+ "4 2014-09-21 318.120514 321.504517 306.502197 310.632446 \n",
+ "... ... ... ... ... ... \n",
+ "2881 2022-08-07 22541.207031 22955.015625 22473.525391 22783.662109 \n",
+ "2882 2022-08-08 22787.236328 23732.878906 22792.974609 23347.226563 \n",
+ "2883 2022-08-09 23349.183594 23441.267578 22445.599609 22691.765625 \n",
+ "2884 2022-08-10 22690.375000 23377.703125 22313.289063 23252.611328 \n",
+ "2885 2022-08-11 23288.640625 23976.431641 23269.263672 23395.583984 \n",
+ "\n",
+ " Adj Close Volume \n",
+ "0 355.957367 16389166 \n",
+ "1 328.539368 26691849 \n",
+ "2 307.761139 29560103 \n",
+ "3 318.758972 28736826 \n",
+ "4 310.632446 20702625 \n",
+ "... ... ... \n",
+ "2881 22783.662109 15617948551 \n",
+ "2882 23347.226563 28020750644 \n",
+ "2883 22691.765625 23075182547 \n",
+ "2884 23252.611328 31884390941 \n",
+ "2885 23395.583984 35669487616 \n",
+ "\n",
+ "[2886 rows x 7 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Date | \n",
+ " Open | \n",
+ " High | \n",
+ " Low | \n",
+ " Close | \n",
+ " Adj Close | \n",
+ " Volume | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2014-09-17 | \n",
+ " 359.546204 | \n",
+ " 361.468506 | \n",
+ " 351.586884 | \n",
+ " 355.957367 | \n",
+ " 355.957367 | \n",
+ " 16389166 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2014-09-18 | \n",
+ " 355.588409 | \n",
+ " 355.505402 | \n",
+ " 319.789459 | \n",
+ " 328.539368 | \n",
+ " 328.539368 | \n",
+ " 26691849 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2014-09-19 | \n",
+ " 328.278503 | \n",
+ " 330.936707 | \n",
+ " 298.921021 | \n",
+ " 307.761139 | \n",
+ " 307.761139 | \n",
+ " 29560103 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2014-09-20 | \n",
+ " 307.665253 | \n",
+ " 329.978180 | \n",
+ " 303.931244 | \n",
+ " 318.758972 | \n",
+ " 318.758972 | \n",
+ " 28736826 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2014-09-21 | \n",
+ " 318.120514 | \n",
+ " 321.504517 | \n",
+ " 306.502197 | \n",
+ " 310.632446 | \n",
+ " 310.632446 | \n",
+ " 20702625 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2881 | \n",
+ " 2022-08-07 | \n",
+ " 22541.207031 | \n",
+ " 22955.015625 | \n",
+ " 22473.525391 | \n",
+ " 22783.662109 | \n",
+ " 22783.662109 | \n",
+ " 15617948551 | \n",
+ "
\n",
+ " \n",
+ " 2882 | \n",
+ " 2022-08-08 | \n",
+ " 22787.236328 | \n",
+ " 23732.878906 | \n",
+ " 22792.974609 | \n",
+ " 23347.226563 | \n",
+ " 23347.226563 | \n",
+ " 28020750644 | \n",
+ "
\n",
+ " \n",
+ " 2883 | \n",
+ " 2022-08-09 | \n",
+ " 23349.183594 | \n",
+ " 23441.267578 | \n",
+ " 22445.599609 | \n",
+ " 22691.765625 | \n",
+ " 22691.765625 | \n",
+ " 23075182547 | \n",
+ "
\n",
+ " \n",
+ " 2884 | \n",
+ " 2022-08-10 | \n",
+ " 22690.375000 | \n",
+ " 23377.703125 | \n",
+ " 22313.289063 | \n",
+ " 23252.611328 | \n",
+ " 23252.611328 | \n",
+ " 31884390941 | \n",
+ "
\n",
+ " \n",
+ " 2885 | \n",
+ " 2022-08-11 | \n",
+ " 23288.640625 | \n",
+ " 23976.431641 | \n",
+ " 23269.263672 | \n",
+ " 23395.583984 | \n",
+ " 23395.583984 | \n",
+ " 35669487616 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2886 rows × 7 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#splitting into Training and Test data\n",
+ "training_to_test_ratio = 0.7\n",
+ "split_val = round(len(df)*training_to_test_ratio)\n",
+ "print(split_val, 'on', len(df))\n",
+ "\n",
+ "training_set = df.iloc[:split_val-1]\n",
+ "test_set = df.iloc[split_val:]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3JSqnSi3BACN",
+ "outputId": "aec1f918-2cbe-4265-d410-c7f292a22b30"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "2020 on 2886\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#checking if it corresponds for applying it later in the resulting plot\n",
+ "pd.DatetimeIndex(df['Date']).year[np.arange(0, 2500, 500)]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7g87aliR3xnp",
+ "outputId": "a0b9b78c-e5bd-419a-bb14-523490e12523"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Int64Index([2014, 2016, 2017, 2018, 2020], dtype='int64', name='Date')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#plotting training and test data\n",
+ "fig, axs = plt.subplots(1,3,figsize=(25,5))\n",
+ "\n",
+ "df.plot(x='Date', y=['Open'], ax=axs[0], title=f'{code} Price', xticks=np.arange(0, len(df), 600))\n",
+ "training_set.plot(x='Date', y=['Open'], ax=axs[1], title=f'{code} Price (training set)', xticks=np.arange(0, len(training_set), 500))\n",
+ "test_set.plot(x='Date', y=['Open'], ax=axs[2], title=f'{code} Price (test set)', xticks=np.arange(0, len(test_set), 250))\n",
+ "\n",
+ "axs[0].axvline(split_val, c='k', linestyle='--') \n",
+ "axs[0].text(round(split_val/2), df['Open'].max()*0.99, s='Training Set', horizontalalignment='center')\n",
+ "axs[0].text(round((split_val+len(df))/2), df['Open'].min(), s='Test Set', horizontalalignment='center')\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 350
+ },
+ "id": "jv6kKnJVBhl_",
+ "outputId": "a6fa78e7-e2b8-44af-de48-dcca6e3294e3"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#scaling the training set\n",
+ "scale = MinMaxScaler(feature_range = (0, 1))\n",
+ "X_train_scaled = scale.fit_transform(training_set.values[:,1].reshape(-1,1))"
+ ],
+ "metadata": {
+ "id": "wFZD0yv5CSPJ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#building the training data sequences from the original data\n",
+ "\n",
+ "X_train = []\n",
+ "y_train = []\n",
+ "\n",
+ "time_stamps = 30 #for each record there will be that number of time stamps\n",
+ "days = 1 #1 = next day, predict the value there will be in the next number of days\n",
+ "\n",
+ "#example (time stamps 30, days 2):\n",
+ "#the previous month will be considered to predict the value of the 2nd day ahead\n",
+ "days_forward = days-1 #for coding purposes 0 is 1 day forward, 1 is 2 days forward\n",
+ "\n",
+ "for i in range(time_stamps, split_val-1-days_forward):\n",
+ " X_train.append(X_train_scaled[i-time_stamps:i, 0])\n",
+ " y_train.append(X_train_scaled[i+days_forward, 0])\n",
+ " \n",
+ "X_train, y_train = np.array(X_train), np.array(y_train)"
+ ],
+ "metadata": {
+ "id": "LESGa0QTEPOC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#Check the outlook of the sequence\n",
+ "X_train_scaled[0:time_stamps, 0]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ySIvEMegtHmU",
+ "outputId": "726ec82a-ba85-4753-c17f-2bc8a13b7928"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([0.01275208, 0.01251108, 0.01084809, 0.00959288, 0.01022954,\n",
+ " 0.0097868 , 0.00991238, 0.01151062, 0.0110267 , 0.01050005,\n",
+ " 0.01023509, 0.01003894, 0.00896653, 0.00890624, 0.00954019,\n",
+ " 0.00938463, 0.00889113, 0.00836913, 0.00686196, 0.0064508 ,\n",
+ " 0.00678027, 0.00702242, 0.00771841, 0.00836187, 0.00828608,\n",
+ " 0.00834608, 0.00906595, 0.00960925, 0.01017264, 0.00959721])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#it should be the same as before\n",
+ "X_train[0].ravel()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "--ssuwfu7E1Q",
+ "outputId": "add7c82a-c88a-487b-f365-b219b2d0ddbd"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([0.01275208, 0.01251108, 0.01084809, 0.00959288, 0.01022954,\n",
+ " 0.0097868 , 0.00991238, 0.01151062, 0.0110267 , 0.01050005,\n",
+ " 0.01023509, 0.01003894, 0.00896653, 0.00890624, 0.00954019,\n",
+ " 0.00938463, 0.00889113, 0.00836913, 0.00686196, 0.0064508 ,\n",
+ " 0.00678027, 0.00702242, 0.00771841, 0.00836187, 0.00828608,\n",
+ " 0.00834608, 0.00906595, 0.00960925, 0.01017264, 0.00959721])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#The training sequence is the first time_stamps values; the target is the last element shown, days_forward positions after them\n",
+ "X_train_scaled[0:time_stamps+days_forward+1, 0]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "eJ8yY8EB4Wa5",
+ "outputId": "ff563c29-ee4f-42e3-9b2f-bbd823235733"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([0.01275208, 0.01251108, 0.01084809, 0.00959288, 0.01022954,\n",
+ " 0.0097868 , 0.00991238, 0.01151062, 0.0110267 , 0.01050005,\n",
+ " 0.01023509, 0.01003894, 0.00896653, 0.00890624, 0.00954019,\n",
+ " 0.00938463, 0.00889113, 0.00836913, 0.00686196, 0.0064508 ,\n",
+ " 0.00678027, 0.00702242, 0.00771841, 0.00836187, 0.00828608,\n",
+ " 0.00834608, 0.00906595, 0.00960925, 0.01017264, 0.00959721,\n",
+ " 0.00906268])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#check if the target corresponds for the time_stamp sequence\n",
+ "X_train_scaled[time_stamps+days_forward]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3Z849PXItDOK",
+ "outputId": "addef356-16e5-443b-f684-3ef2a7a72e91"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([0.00906268])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#it should be the same as before\n",
+ "y_train[0]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UEyt5l7sqXgT",
+ "outputId": "1d03cbd1-bef3-4136-abf6-2bbcb7c2b0d2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.009062676067257507"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#reshaping the training set for LSTM\n",
+ "X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))"
+ ],
+ "metadata": {
+ "id": "4TVi8PCyEkrH"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# BUILDING THE MODEL"
+ ],
+ "metadata": {
+ "id": "CeboMNqK0tfK"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "n_epochs = 15 #judging by the average resulting loss, a higher number of epochs usually does not improve it much\n",
+ "n_layers = 4"
+ ],
+ "metadata": {
+ "id": "WIdN5rXk4sej"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "n_layers -= 1\n",
+ "#initialising the model\n",
+ "model = Sequential()\n",
+ "\n",
+ "#adding n° LSTM layers\n",
+ "model.add(LSTM(units = 50, return_sequences = True, input_shape = (time_stamps, 1)))\n",
+ "model.add(Dropout(0.2))\n",
+ "for i in range(n_layers):\n",
+ " if i < (n_layers-1):\n",
+ " return_sequences = True\n",
+ " else:\n",
+ " return_sequences = False\n",
+ " model.add(LSTM(units = 50, return_sequences = return_sequences))\n",
+ " model.add(Dropout(0.2))\n",
+ "\n",
+ "#adding the output dense layer\n",
+ "model.add(Dense(units = 1))\n",
+ "\n",
+ "#compiling the model with adam and mse\n",
+ "model.compile(optimizer = 'adam', loss = 'mean_squared_error')"
+ ],
+ "metadata": {
+ "id": "BDg8YlglJeyM"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#training on the training data\n",
+ "model.fit(X_train, y_train, epochs = n_epochs, batch_size = 32)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wkxKfsQHEaaR",
+ "outputId": "0c722e0a-f765-4f11-9647-208d26e6538c"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Epoch 1/15\n",
+ "63/63 [==============================] - 19s 65ms/step - loss: 0.0116\n",
+ "Epoch 2/15\n",
+ "63/63 [==============================] - 4s 70ms/step - loss: 0.0035\n",
+ "Epoch 3/15\n",
+ "63/63 [==============================] - 4s 65ms/step - loss: 0.0028\n",
+ "Epoch 4/15\n",
+ "63/63 [==============================] - 4s 65ms/step - loss: 0.0028\n",
+ "Epoch 5/15\n",
+ "63/63 [==============================] - 4s 66ms/step - loss: 0.0027\n",
+ "Epoch 6/15\n",
+ "63/63 [==============================] - 4s 65ms/step - loss: 0.0021\n",
+ "Epoch 7/15\n",
+ "63/63 [==============================] - 4s 64ms/step - loss: 0.0021\n",
+ "Epoch 8/15\n",
+ "63/63 [==============================] - 4s 64ms/step - loss: 0.0020\n",
+ "Epoch 9/15\n",
+ "63/63 [==============================] - 4s 64ms/step - loss: 0.0020\n",
+ "Epoch 10/15\n",
+ "63/63 [==============================] - 4s 65ms/step - loss: 0.0017\n",
+ "Epoch 11/15\n",
+ "63/63 [==============================] - 4s 66ms/step - loss: 0.0018\n",
+ "Epoch 12/15\n",
+ "63/63 [==============================] - 4s 65ms/step - loss: 0.0019\n",
+ "Epoch 13/15\n",
+ "63/63 [==============================] - 4s 65ms/step - loss: 0.0018\n",
+ "Epoch 14/15\n",
+ "63/63 [==============================] - 4s 66ms/step - loss: 0.0016\n",
+ "Epoch 15/15\n",
+ "63/63 [==============================] - 4s 66ms/step - loss: 0.0016\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.summary()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yg6MRToSPGpf",
+ "outputId": "32d615f6-c039-4234-e5b7-5294e6dc522d"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Model: \"sequential\"\n",
+ "_________________________________________________________________\n",
+ " Layer (type) Output Shape Param # \n",
+ "=================================================================\n",
+ " lstm (LSTM) (None, 30, 50) 10400 \n",
+ " \n",
+ " dropout (Dropout) (None, 30, 50) 0 \n",
+ " \n",
+ " lstm_1 (LSTM) (None, 30, 50) 20200 \n",
+ " \n",
+ " dropout_1 (Dropout) (None, 30, 50) 0 \n",
+ " \n",
+ " lstm_2 (LSTM) (None, 30, 50) 20200 \n",
+ " \n",
+ " dropout_2 (Dropout) (None, 30, 50) 0 \n",
+ " \n",
+ " lstm_3 (LSTM) (None, 50) 20200 \n",
+ " \n",
+ " dropout_3 (Dropout) (None, 50) 0 \n",
+ " \n",
+ " dense (Dense) (None, 1) 51 \n",
+ " \n",
+ "=================================================================\n",
+ "Total params: 71,051\n",
+ "Trainable params: 71,051\n",
+ "Non-trainable params: 0\n",
+ "_________________________________________________________________\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#checking the results of prediction on the training set\n",
+ "moving_horizontally = np.arange(time_stamps, len(training_set)-days_forward)\n",
+ "train_prediction = scale.inverse_transform(model.predict(X_train))\n",
+ "\n",
+ "plt.plot(training_set.values[:,1], color = 'blue', label = 'Real Price')\n",
+ "plt.plot(moving_horizontally, train_prediction, color = 'red', label = 'Predicted Price')\n",
+ "plt.title(f'{code} Price Prediction (Training Data)')\n",
+ "plt.xlabel('Time')\n",
+ "plt.ylabel('Price')\n",
+ "\n",
+ "ticks_freq = 500\n",
+ "plt.xticks(np.arange(0, len(training_set), ticks_freq), training_set['Date'][np.arange(training_set.index[0], training_set.index[-1], ticks_freq)])\n",
+ "\n",
+ "#here the prediction starts later because the first time_stamps values are needed as input for the first prediction\n",
+ "plt.axvline(time_stamps, c='k', linestyle='--') \n",
+ "plt.legend()\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 295
+ },
+ "id": "vgvWkROTxx5i",
+ "outputId": "5c3f2c12-c01c-4f09-e47b-4cb7ee6d5283"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#getting the test set in the same format of the training (meaning a sequence composed of n° of time_stamps)\n",
+ "to_test = pd.concat((training_set[-time_stamps:], test_set))['Open'].values\n",
+ "to_test = to_test.reshape(-1,1)\n",
+ "to_test = scale.transform(to_test)\n",
+ "\n",
+ "X_test = []\n",
+ "for i in range(time_stamps, len(test_set)+time_stamps):\n",
+ " X_test.append(to_test[i-time_stamps:i, 0])\n",
+ "X_test = np.array(X_test)\n",
+ "X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n",
+ "\n",
+ "#rescaling the data\n",
+ "y_pred = model.predict(X_test)\n",
+ "test_predictions = scale.inverse_transform(y_pred)"
+ ],
+ "metadata": {
+ "id": "DgXi07h5qw74"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#plotting the results\n",
+ "plt.plot(test_set.values[:,1], color = 'blue', label = 'Real Price')\n",
+ "plt.plot(test_predictions, color = 'red', label = 'Predicted Price')\n",
+ "plt.title(f'{code} Price Prediction (Test Data)')\n",
+ "plt.xlabel('Time')\n",
+ "plt.ylabel('Price')\n",
+ "\n",
+ "ticks_freq = 200\n",
+ "plt.xticks(np.arange(0, len(test_set), ticks_freq), test_set['Date'][np.arange(test_set.index[0], test_set.index[-1], ticks_freq)])\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "id": "4DP0yVlXOn8F",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 295
+ },
+ "outputId": "c953f892-3095-4ccd-f291-e114a3f8a145"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Extra: Testing Rolling predictions"
+ ],
+ "metadata": {
+ "id": "3c_IDwtyeugJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#This is not needed in practice, as it should be possible to access the data day by day,\n",
+ "#but I did it as it is useful to understand the underlying mechanism with certain sequences\n",
+ "\n",
+ "#I needed to set the first sequence of n° time_stamps\n",
+ "first_sequence = X_test[0].reshape(1,time_stamps,1) #reshaping to the format needed by the model\n",
+ "\n",
+ "#defining a function that returns a list of subsequent predictions with the same length as the first sequence\n",
+ "def predict_next(first_sequence, model = model):\n",
+ " rolling = [x for x in first_sequence.ravel()]\n",
+ " prediction_sequence = []\n",
+ "\n",
+ " for x in range(0, len(rolling)):\n",
+ " sequence_to_use = rolling[x:]+prediction_sequence\n",
+ " sequence_to_use = np.array(sequence_to_use).reshape(1,time_stamps,1)\n",
+ " predicted_value = model.predict(sequence_to_use)[0][0]\n",
+ " prediction_sequence.append(predicted_value)\n",
+ " \n",
+ " return prediction_sequence\n",
+ "\n",
+ "#defining a function that loops over the previous one to give me the complete sequence\n",
+ "def predict_long(first_sequence, loop = 3): #if the length of first sequence is 30, then loop 3 is around three months\n",
+ " grand_sequence = []\n",
+ " \n",
+ " sequence_to_use = first_sequence\n",
+ "\n",
+ " for i in range(loop):\n",
+ " grand_sequence.append(predict_next(sequence_to_use))\n",
+ " sequence_to_use = np.array(grand_sequence[i]).reshape(1,time_stamps,1)\n",
+ "\n",
+ " return grand_sequence"
+ ],
+ "metadata": {
+ "id": "3dQkDewMe2LO"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import itertools\n",
+ "\n",
+ "grand_sequence = predict_long(first_sequence, loop = 25)\n",
+ "delist = list(itertools.chain.from_iterable(grand_sequence))\n",
+ "predicted_grand_sequence = scale.inverse_transform(np.array(delist).reshape(-1,1)).ravel()"
+ ],
+ "metadata": {
+ "id": "tKEd5oTGkGeS"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print('test_set:', len(test_set), '\\nlist_to_plot:', len(predicted_grand_sequence))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3HhLnOufpHbT",
+ "outputId": "e489f50e-c80c-49ec-8e7e-dab0be5d8bbc"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_set: 866 \n",
+ "list_to_plot: 750\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import matplotlib.patches as patches\n",
+ "\n",
+ "#Plotting the results\n",
+ "fig, ax = plt.subplots()\n",
+ "\n",
+ "horizontal_shift = np.arange(time_stamps, time_stamps+len(predicted_grand_sequence))\n",
+ "slice_origin = slice(split_val-time_stamps, split_val+len(predicted_grand_sequence))\n",
+ "real_grand_sequence = df['Open'].values[slice_origin]\n",
+ "\n",
+ "ax.plot(real_grand_sequence, color = 'blue', label = 'Real Price')\n",
+ "ax.plot(horizontal_shift, predicted_grand_sequence, color = 'red', label = 'Rolling Prediction')\n",
+ "ax.set_title(f'{code} Rolling Prediction (Test Data)')\n",
+ "ax.set_xlabel('Time')\n",
+ "ax.set_ylabel('Price')\n",
+ "\n",
+ "ticks_freq = 200\n",
+ "ax.set_xticks(np.arange(0, len(test_set), ticks_freq))\n",
+ "ax.set_xticklabels(test_set['Date'][np.arange(test_set.index[0], test_set.index[-1], ticks_freq)])\n",
+ "\n",
+ "#Here I created a rectangle comprising the first original (blue) sequence from which the rolling prediction came\n",
+ "first_original = real_grand_sequence[:time_stamps] #the data to surround\n",
+ "xpos = -len(real_grand_sequence)*0.01 #positioning 1% earlier than the first value to not be too tight\n",
+ "ypos = first_original.min()*0.99\n",
+ "width = time_stamps-xpos #it's not precise on a large scale thus I enlarge it by 1%\n",
+ "height = first_original.max()*1.01-ypos\n",
+ "\n",
+ "#Adding the rectangle\n",
+ "rect = patches.Rectangle((xpos, ypos), width, height, linewidth=1, edgecolor='k', facecolor='none', label = 'First Sequence')\n",
+ "ax.add_patch(rect)\n",
+ "\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 295
+ },
+ "id": "fCiOZLHiku8B",
+ "outputId": "f1037282-4694-4ef5-e31b-b196d4779f2d"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import matplotlib.patches as patches\n",
+ "\n",
+ "#Plotting the results\n",
+ "fig, ax = plt.subplots()\n",
+ "\n",
+ "horizontal_shift = np.arange(time_stamps, time_stamps+len(predicted_grand_sequence))\n",
+ "slice_origin = slice(split_val-time_stamps, split_val+len(predicted_grand_sequence))\n",
+ "real_grand_sequence = df['Open'].values[slice_origin]\n",
+ "\n",
+ "ax.plot(real_grand_sequence, color = 'blue', label = 'Real Price')\n",
+ "ax.plot(horizontal_shift, predicted_grand_sequence, color = 'red', label = 'Rolling Prediction')\n",
+ "ax.set_title(f'{code} Rolling Prediction (Test Data)')\n",
+ "ax.set_xlabel('Time')\n",
+ "ax.set_ylabel('Price')\n",
+ "\n",
+ "#Here I created a rectangle comprising the first original (blue) sequence from which the rolling prediction came\n",
+ "first_original = real_grand_sequence[:30] #the data to surround\n",
+ "xpos = -len(real_grand_sequence)*0.01 #positioning 1% earlier than the first value to not be too tight\n",
+ "ypos = first_original.min()*0.99\n",
+ "width = time_stamps-xpos #it's not precise on a large scale thus I enlarge it by 1%\n",
+ "height = first_original.max()*1.01-ypos\n",
+ "plt.xlim(-5, 100)\n",
+ "plt.ylim(ypos*0.98, (height+ypos)*1.05)\n",
+ "\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 295
+ },
+ "id": "J7dMJ8Hmw2qu",
+ "outputId": "5117a388-28c5-44f8-a6c9-ed37718f6751"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Conclusion: without constantly updating the input data with real observations (and without retraining on the days_forward), the rolling prediction will probably follow a curve of an activation function. That curve is shaped initially by the patterns optimized during training on time_stamps-long sequences similar to the first sequence; afterwards the prediction keeps rolling along the last value, probably with an almost identical output of the activation function."
+ ],
+ "metadata": {
+ "id": "5IinRPqgsMey"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/02_DevNotebook_Debugging_classes_and_functions.ipynb b/notebooks/02_DevNotebook_Debugging_classes_and_functions.ipynb
new file mode 100644
index 0000000..085faaa
--- /dev/null
+++ b/notebooks/02_DevNotebook_Debugging_classes_and_functions.ipynb
@@ -0,0 +1,1986 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VXs2IK6k92Gh"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import datetime\n",
+ "import time\n",
+ "import urllib.request\n",
+ "from argparse import ArgumentParser\n",
+ "from typing import Optional\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from sklearn.preprocessing import MinMaxScaler\n",
+ "\n",
+ "from tensorflow.keras.models import load_model\n",
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.layers import Dense\n",
+ "from tensorflow.keras.layers import LSTM\n",
+ "from tensorflow.keras.layers import Dropout\n",
+ "\n",
+ "def yahoo_finance_csv(code : str,\n",
+ "                      start_from_date : str = '2010-07-01',\n",
+ "                      end_to_date : Optional[str] = None,\n",
+ "                      interval : str = 'd') -> str: #other intervals are 'wk' and 'mo'\n",
+ "    \"\"\"Download the price history of a ticker from Yahoo Finance into a temporary CSV file.\n",
+ "\n",
+ "    Args:\n",
+ "        code: ticker symbol, inserted in the download URL as it is.\n",
+ "        start_from_date: first day requested, in ISO format (YYYY-MM-DD).\n",
+ "        end_to_date: last day requested, in ISO format; None (default) means today,\n",
+ "            resolved when the function is called.\n",
+ "        interval: unit appended to '1' in the query string ('d', 'wk' or 'mo').\n",
+ "\n",
+ "    Returns:\n",
+ "        The path of the temporary file where the CSV has been saved.\n",
+ "\n",
+ "    Network and HTTP errors raised by urllib are not caught here.\n",
+ "    \"\"\"\n",
+ "    #a default computed in the signature would stay frozen at the day the function was defined\n",
+ "    if end_to_date is None:\n",
+ "        end_to_date = datetime.date.today().isoformat()\n",
+ "\n",
+ "    #setting a browser-like header to avoid possible errors (the opener is installed globally)\n",
+ "    opener = urllib.request.build_opener()\n",
+ "    opener.addheaders = [('User-Agent','Mozilla/5.0')]\n",
+ "    urllib.request.install_opener(opener)\n",
+ "\n",
+ "    #converting dates from iso format\n",
+ "    first_day = datetime.datetime.fromisoformat(start_from_date)\n",
+ "    last_day = datetime.datetime.fromisoformat(end_to_date)\n",
+ "\n",
+ "    #converting dates to unix time (time.mktime reads the dates as local time)\n",
+ "    start_from_code = int(time.mktime(first_day.timetuple()))\n",
+ "    todays_code = int(time.mktime(last_day.timetuple()))\n",
+ "\n",
+ "    url = (f'https://query1.finance.yahoo.com/v7/finance/download/{code}'\n",
+ "           f'?period1={start_from_code}&period2={todays_code}&interval=1{interval}'\n",
+ "           '&events=history&includeAdjustedClose=true')\n",
+ "\n",
+ "    #urlretrieve saves the response as a temporary file and returns (path, HTTP headers)\n",
+ "    pos_saved_csv, _headers = urllib.request.urlretrieve(url)\n",
+ "\n",
+ "    return pos_saved_csv\n",
+ "\n",
+ "def quick_tomorrow(code : str,\n",
+ "                   plot : bool = True,\n",
+ "                   start_from_date : Optional[str] = None,\n",
+ "                   training_to_test_ratio : Optional[float] = None,\n",
+ "                   n_layers : Optional[int] = None,\n",
+ "                   n_epochs : Optional[int] = None) -> tuple:\n",
+ "    \"\"\"Fit a Price_Predictor on a ticker and forecast the next price in one call.\n",
+ "\n",
+ "    Args:\n",
+ "        code: Yahoo Finance ticker symbol.\n",
+ "        plot: if True, the data and the results on the test set are plotted side by side.\n",
+ "        start_from_date: first day of the history, ISO format ('2010-07-01' when None).\n",
+ "        training_to_test_ratio: fraction of rows used for training (0.9 when None).\n",
+ "        n_layers: number of LSTM layers (4 when None).\n",
+ "        n_epochs: number of training epochs (10 when None).\n",
+ "\n",
+ "    Returns:\n",
+ "        The tuple (fitted Price_Predictor, predicted next price).\n",
+ "    \"\"\"\n",
+ "    if start_from_date is None:\n",
+ "        start_from_date = '2010-07-01'\n",
+ "    if training_to_test_ratio is None:\n",
+ "        training_to_test_ratio = 0.9\n",
+ "    if n_layers is None:\n",
+ "        n_layers = 4\n",
+ "    if n_epochs is None:\n",
+ "        n_epochs = 10\n",
+ "\n",
+ "    #fit_at_start = True trains the model while the object is being created\n",
+ "    fitted_model = Price_Predictor(code = code, start_from_date = start_from_date,\n",
+ "                                   training_to_test_ratio = training_to_test_ratio,\n",
+ "                                   n_layers = n_layers,\n",
+ "                                   n_epochs = n_epochs,\n",
+ "                                   fit_at_start = True)\n",
+ "\n",
+ "    if plot:\n",
+ "        fig, ax = plt.subplots(1, 2, figsize=(18,5))\n",
+ "        fitted_model.plot_data(ax[0])\n",
+ "        fitted_model.plot_results(ax[1])\n",
+ "        plt.legend()\n",
+ "        plt.show()\n",
+ "\n",
+ "    tomorrows_value = fitted_model.predict(return_info = False)\n",
+ "\n",
+ "    last_price = fitted_model.df['Open'].values[-1]\n",
+ "    last_date = fitted_model.df['Date'].values[-1]\n",
+ "    #'.2f' keeps two decimals; the former '2f' was a minimum width of 2 with six decimals\n",
+ "    print(f'Last price was {last_price:.2f} on {last_date}',\n",
+ "          f'Next price is predicted to be {tomorrows_value:.2f}', sep='\\n')\n",
+ "\n",
+ "    return fitted_model, tomorrows_value\n",
+ "\n",
+ "class Price_Predictor():\n",
+ "    \"\"\"LSTM model that forecasts the 'Open' price of a Yahoo Finance ticker.\n",
+ "\n",
+ "    On creation the price history is downloaded, the rows with missing values are dropped,\n",
+ "    the train/test split position is computed and the network is built (or loaded from disk).\n",
+ "    The training happens in fit_and_test(), run at creation only when fit_at_start is True.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, code : str,\n",
+ "                 start_from_date : str = '2010-07-01',\n",
+ "                 end_to_date : Optional[str] = None,\n",
+ "                 interval : str = 'd',\n",
+ "                 time_stamps : int = 30,\n",
+ "                 training_to_test_ratio : float = 0.7,\n",
+ "                 n_layers : int = 4, #minimum input is 2\n",
+ "                 n_epochs : int = 15,\n",
+ "                 verbose : int = 0,\n",
+ "                 load_model : bool = False,\n",
+ "                 path_load : str = 'model_saved',\n",
+ "                 fit_at_start : bool = False,\n",
+ "                 days_forward : int = 1):\n",
+ "        \"\"\"Download the data and prepare the model.\n",
+ "\n",
+ "        Args:\n",
+ "            code: Yahoo Finance ticker symbol.\n",
+ "            start_from_date: first day of the history, ISO format.\n",
+ "            end_to_date: last day of the history, ISO format; None (default) means today,\n",
+ "                resolved when the object is created.\n",
+ "            interval: sampling interval forwarded to yahoo_finance_csv ('d', 'wk' or 'mo').\n",
+ "            time_stamps: length of the input window fed to the network.\n",
+ "            training_to_test_ratio: fraction of the rows used for training.\n",
+ "            n_layers: number of LSTM layers, raised to 2 when lower.\n",
+ "            n_epochs: number of training epochs.\n",
+ "            verbose: verbosity passed to the model fit, clipped to the range 0-2.\n",
+ "            load_model: if True, a saved model is loaded from path_load when that directory exists.\n",
+ "            path_load: directory of the saved model.\n",
+ "            fit_at_start: if True, fit_and_test() is run right away.\n",
+ "            days_forward: forecast horizon used when fit_at_start is True.\n",
+ "        \"\"\"\n",
+ "        #a default computed in the signature would stay frozen at the day the class was defined\n",
+ "        if end_to_date is None:\n",
+ "            end_to_date = datetime.date.today().isoformat()\n",
+ "\n",
+ "        self.code = code\n",
+ "        self.start_from_date = start_from_date\n",
+ "        self.end_to_date = end_to_date\n",
+ "        self.interval = interval\n",
+ "        self.time_stamps = time_stamps\n",
+ "        self.training_to_test_ratio = training_to_test_ratio\n",
+ "        self.n_layers = max(2, n_layers) #it's useless if less than 2\n",
+ "        self.n_epochs = n_epochs\n",
+ "        self.verbose = min(2, max(0, verbose))\n",
+ "        self.load_model = load_model\n",
+ "        self.path_load = path_load\n",
+ "\n",
+ "        #get data and preprocessing (the interval must be forwarded, otherwise daily data is always downloaded)\n",
+ "        self.df = pd.read_csv(yahoo_finance_csv(code = code,\n",
+ "                                                start_from_date = self.start_from_date,\n",
+ "                                                end_to_date = self.end_to_date,\n",
+ "                                                interval = self.interval))\n",
+ "        self.df = self.df.dropna().reset_index(drop=True)\n",
+ "\n",
+ "        #set the scaler\n",
+ "        self.scale = MinMaxScaler(feature_range = (0, 1))\n",
+ "\n",
+ "        #split: the rows before split_val are for training, the remaining ones for testing\n",
+ "        self.split_val = round(len(self.df)*self.training_to_test_ratio)\n",
+ "\n",
+ "        #setting the frequency of date ticks for the plots (at least 1, np.arange fails with a step of 0)\n",
+ "        self.data_ticks_freq = max(1, round(0.25 * self.split_val))\n",
+ "        self.result_ticks_freq = max(1, round(0.25 * len(self.df)*(1-self.training_to_test_ratio)))\n",
+ "\n",
+ "        #initialize the model\n",
+ "        self.model = self.model_initialize(self.n_layers)\n",
+ "        if fit_at_start:\n",
+ "            self.test_predictions = self.fit_and_test(days_forward, self.df, self.split_val)\n",
+ "\n",
+ "\n",
+ "    def model_initialize(self,\n",
+ "                         n_layers : int,\n",
+ "                         optimizer : str = 'adam',\n",
+ "                         loss_metric : str = 'mean_squared_error') -> Sequential:\n",
+ "        \"\"\"Load the saved model when requested and available, otherwise build a new one.\n",
+ "\n",
+ "        The new network is a stack of n_layers LSTM layers (50 units each, every one followed\n",
+ "        by a 0.2 Dropout) closed by a single-unit Dense output.\n",
+ "\n",
+ "        Args:\n",
+ "            n_layers: number of LSTM layers, raised to 2 when lower.\n",
+ "            optimizer: optimizer passed to compile().\n",
+ "            loss_metric: loss passed to compile().\n",
+ "\n",
+ "        Returns:\n",
+ "            The loaded model, or the newly compiled one.\n",
+ "        \"\"\"\n",
+ "        #remembered so that fit_and_test skips the training only for a model really loaded from disk\n",
+ "        self._model_loaded = bool(self.load_model) and os.path.isdir(self.path_load)\n",
+ "\n",
+ "        if self._model_loaded:\n",
+ "            print(f'Loading model from {self.path_load}')\n",
+ "            return load_model(self.path_load)\n",
+ "\n",
+ "        n_following = max(2, int(n_layers)) - 1 #the input layer doesn't count\n",
+ "        #initialising the model\n",
+ "        model = Sequential()\n",
+ "        #adding the LSTM layers\n",
+ "        model.add(LSTM(units = 50, return_sequences = True, input_shape = (self.time_stamps, 1)))\n",
+ "        model.add(Dropout(0.2))\n",
+ "        for i in range(n_following):\n",
+ "            #only the last LSTM layer does not return the whole sequence\n",
+ "            model.add(LSTM(units = 50, return_sequences = i < n_following-1))\n",
+ "            model.add(Dropout(0.2))\n",
+ "        #adding the output dense layer\n",
+ "        model.add(Dense(units = 1))\n",
+ "        #compiling the model (adam and mse by default)\n",
+ "        model.compile(optimizer = optimizer, loss = loss_metric)\n",
+ "\n",
+ "        return model\n",
+ "\n",
+ "    def fit_and_test(self, days_forward : int,\n",
+ "                     df : Optional[pd.DataFrame] = None,\n",
+ "                     split_val : Optional[int] = None) -> np.ndarray: #this method is not meant to be run by the user\n",
+ "        \"\"\"Train the model on the training rows and predict the test rows.\n",
+ "\n",
+ "        Args:\n",
+ "            days_forward: forecast horizon in days; 1 means the value right after the input window.\n",
+ "            df: data to use, self.df when None.\n",
+ "            split_val: position of the first test row, self.split_val when None.\n",
+ "\n",
+ "        Returns:\n",
+ "            The predictions on the test set, scaled back to prices; also stored in self.test_predictions.\n",
+ "\n",
+ "        Raises:\n",
+ "            ValueError: when the training rows are too few to build a window, or the test set is empty.\n",
+ "        \"\"\"\n",
+ "        if df is None:\n",
+ "            df = self.df\n",
+ "        if split_val is None:\n",
+ "            split_val = self.split_val\n",
+ "\n",
+ "        #stored as an offset from the end of the window: 0 is the very next value\n",
+ "        self.days_forward = max(0, days_forward-1)\n",
+ "\n",
+ "        #contiguous split: the slice end is already exclusive, a further -1 would drop one row\n",
+ "        training_set = df.iloc[:split_val]\n",
+ "        test_set = df.iloc[split_val:]\n",
+ "\n",
+ "        X_train_scaled = self.scale.fit_transform(training_set['Open'].values.reshape(-1,1))\n",
+ "\n",
+ "        #each sample is a window of time_stamps values, the target is days_forward after it\n",
+ "        window_ends = range(self.time_stamps, len(X_train_scaled)-self.days_forward)\n",
+ "        if len(window_ends) == 0:\n",
+ "            raise ValueError(f'The training set must have more than {self.time_stamps+self.days_forward} rows')\n",
+ "        X_train = np.array([X_train_scaled[i-self.time_stamps:i, 0] for i in window_ends])\n",
+ "        y_train = np.array([X_train_scaled[i+self.days_forward, 0] for i in window_ends])\n",
+ "        #reshaping the training set for LSTM\n",
+ "        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))\n",
+ "\n",
+ "        #a model loaded from disk is used as it is, a newly built one always has to be trained\n",
+ "        if not getattr(self, '_model_loaded', False):\n",
+ "            #fitting on the training data\n",
+ "            self.model.fit(X_train, y_train, epochs = self.n_epochs, batch_size = 32, verbose = self.verbose)\n",
+ "\n",
+ "        if len(test_set) == 0:\n",
+ "            raise ValueError('The test set is empty, lower the training_to_test_ratio')\n",
+ "\n",
+ "        #getting the test set in the same format of the training (meaning a sequence composed of n° of time_stamps)\n",
+ "        to_test = pd.concat((training_set.iloc[-self.time_stamps:], test_set))['Open'].values\n",
+ "        to_test = to_test.reshape(-1,1)\n",
+ "        to_test = self.scale.transform(to_test)\n",
+ "\n",
+ "        X_test = np.array([to_test[i-self.time_stamps:i, 0]\n",
+ "                           for i in range(self.time_stamps, len(test_set)+self.time_stamps)])\n",
+ "        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n",
+ "\n",
+ "        y_pred = self.model.predict(X_test)\n",
+ "\n",
+ "        #rescaling the data\n",
+ "        self.test_predictions = self.scale.inverse_transform(y_pred)\n",
+ "\n",
+ "        return self.test_predictions\n",
+ "\n",
+ "    def save_model(self, dir : str = 'model_saved'):\n",
+ "        \"\"\"Save the model in dir, asking for confirmation when the directory already exists.\n",
+ "\n",
+ "        When the directory exists the user is asked whether to overwrite it; answering 'n'\n",
+ "        allows to type a different directory name, or to give up without saving.\n",
+ "        \"\"\"\n",
+ "        if not os.path.isdir(dir):\n",
+ "            self.model.save(dir)\n",
+ "            return\n",
+ "\n",
+ "        response = ''\n",
+ "        #keep asking until the answer is either y or n\n",
+ "        while response not in ('y', 'n'):\n",
+ "            response = input('A model already exists, overwrite? y/n: ').lower()\n",
+ "            if response == 'y':\n",
+ "                self.model.save(dir)\n",
+ "            elif response == 'n':\n",
+ "                if input('Rename? y/n: ').lower() == 'y':\n",
+ "                    self.model.save(input('Write a dir name: '))\n",
+ "\n",
+ "    def plot_data(self, ax = None):\n",
+ "        \"\"\"Plot the 'Open' price of the whole data, with a dashed line at the train/test split.\n",
+ "\n",
+ "        Args:\n",
+ "            ax: matplotlib axes to draw on, a new figure is created when None.\n",
+ "        \"\"\"\n",
+ "        if ax is None:\n",
+ "            fig, ax = plt.subplots(1, 1, figsize=(9,5))\n",
+ "        self.df.plot(x='Date', y=['Open'], ax=ax, title=f'{self.code} Price', xticks=np.arange(0, len(self.df), self.data_ticks_freq))\n",
+ "        ax.axvline(self.split_val, c='k', linestyle='--')\n",
+ "\n",
+ "        #the two labels are centered in the area at the left and at the right of the split line\n",
+ "        xlim = ax.get_xlim()\n",
+ "        ax.text(round((self.split_val+xlim[0])/2), self.df['Open'].max()*0.99, s='Training Set', horizontalalignment='center')\n",
+ "        ax.text(round((self.split_val+xlim[1])/2), self.df['Open'].min(), s='Test Set', horizontalalignment='center')\n",
+ "\n",
+ "    def plot_results(self, ax = None):\n",
+ "        \"\"\"Plot the real and the predicted prices of the test set.\n",
+ "\n",
+ "        fit_and_test() must have been run before, since it creates self.test_predictions.\n",
+ "\n",
+ "        Args:\n",
+ "            ax: matplotlib axes to draw on, a new figure is created when None.\n",
+ "        \"\"\"\n",
+ "        if ax is None:\n",
+ "            fig, ax = plt.subplots(1, 1, figsize=(9,5))\n",
+ "\n",
+ "        test_set = self.df.iloc[self.split_val:]\n",
+ "\n",
+ "        ax.plot(test_set['Open'].values, color = 'blue', label = 'Real Price')\n",
+ "        ax.plot(self.test_predictions, color = 'red', label = 'Predicted Price')\n",
+ "        ax.set_title(f'{self.code} Price Prediction (Test Data)')\n",
+ "        ax.set_xlabel('Time')\n",
+ "        ax.set_ylabel('Price')\n",
+ "\n",
+ "        #ticks and labels come from the same positions, so their number always matches\n",
+ "        tick_positions = np.arange(0, len(test_set), self.result_ticks_freq)\n",
+ "        ax.set_xticks(tick_positions)\n",
+ "        ax.set_xticklabels(test_set['Date'].values[tick_positions])\n",
+ "\n",
+ "    def predict(self, input_sequence = None, return_info : bool = True) -> float:\n",
+ "        \"\"\"Forecast the price that follows the given sequence.\n",
+ "\n",
+ "        fit_and_test() must have been run before: it fits the scaler and sets days_forward,\n",
+ "        which are both used here.\n",
+ "\n",
+ "        Args:\n",
+ "            input_sequence: array-like of prices, only its last time_stamps values are used.\n",
+ "                None (default) takes the last time_stamps 'Open' values of the downloaded data.\n",
+ "            return_info: if True, informative messages and the result are printed.\n",
+ "\n",
+ "        Returns:\n",
+ "            The predicted price, scaled back to the original range.\n",
+ "\n",
+ "        Raises:\n",
+ "            ValueError: when the sequence has fewer than time_stamps values.\n",
+ "        \"\"\"\n",
+ "        if input_sequence is None:\n",
+ "            if return_info:\n",
+ "                print(f'No Input sequence provided, the last {self.time_stamps} records of the data downloaded will be used instead.\\n')\n",
+ "            input_sequence = self.df['Open'].values[-self.time_stamps:]\n",
+ "        input_sequence = np.array(input_sequence).ravel().astype('float32')\n",
+ "\n",
+ "        if len(input_sequence) > self.time_stamps:\n",
+ "            if return_info:\n",
+ "                print(f'\\nWARNING: The input sequence on which to forecast is longer than {self.time_stamps}',\n",
+ "                      'which is the input time stamp and the length of array needed in order to get a prediction, '\n",
+ "                      f'the last {self.time_stamps} records will be considered instead.\\n')\n",
+ "            input_sequence = input_sequence[-self.time_stamps:]\n",
+ "\n",
+ "        elif len(input_sequence) < self.time_stamps:\n",
+ "            raise ValueError(f\"The array must be at least {self.time_stamps} in length\")\n",
+ "\n",
+ "        #same scaling of the training, then the shape (1 sample, time_stamps, 1 feature) used by the network\n",
+ "        scaled_sequence = self.scale.transform(input_sequence.reshape(-1,1))\n",
+ "        X_to_predict = scaled_sequence.reshape(1,self.time_stamps,1)\n",
+ "        y_predicted = self.model.predict(X_to_predict)\n",
+ "        rescaled = self.scale.inverse_transform(y_predicted)\n",
+ "\n",
+ "        if return_info:\n",
+ "            print(f'In {self.days_forward+1} day(s) the price will be:', rescaled.ravel()[0])\n",
+ "\n",
+ "        return rescaled.ravel()[0]\n",
+ "\n",
+ "    def __get_data_frame__(self) -> pd.DataFrame:\n",
+ "        \"\"\"Return the whole downloaded data.\"\"\"\n",
+ "        return self.df\n",
+ "\n",
+ "    def __get_training_set__(self) -> pd.DataFrame:\n",
+ "        \"\"\"Return the rows used for training, the ones before split_val.\"\"\"\n",
+ "        return self.df.iloc[:self.split_val]\n",
+ "\n",
+ "    def __get_test_set__(self) -> pd.DataFrame:\n",
+ "        \"\"\"Return the rows used for testing, from split_val onwards.\"\"\"\n",
+ "        return self.df.iloc[self.split_val:]\n",
+ "\n",
+ "    def __get_params__(self) -> dict:\n",
+ "        \"\"\"Return the settings of the object; days_forward is included only after fit_and_test().\"\"\"\n",
+ "        params = {'code' : self.code,\n",
+ "                  'start_from_date' : self.start_from_date,\n",
+ "                  'end_to_date' : self.end_to_date,\n",
+ "                  'interval' : self.interval,\n",
+ "                  'time_stamps' : self.time_stamps,\n",
+ "                  'training_to_test_ratio' : self.training_to_test_ratio,\n",
+ "                  'split_val' : self.split_val,\n",
+ "                  'n_layers' : self.n_layers,\n",
+ "                  'n_epochs': self.n_epochs,\n",
+ "                  'load_model' : self.load_model,\n",
+ "                  'path_load' : self.path_load}\n",
+ "\n",
+ "        #the attribute is created by fit_and_test, where it is stored decreased by 1\n",
+ "        if hasattr(self, 'days_forward'):\n",
+ "            params['days_forward'] = self.days_forward+1\n",
+ "\n",
+ "        return params\n",
+ "\n",
+ "class Predict_Iterator(Price_Predictor):\n",
+ "    \"\"\"Price_Predictor configured through a single effort value, able to forecast several days ahead.\"\"\"\n",
+ "\n",
+ "    def __init__(self, code : str,\n",
+ "                 start_from_date : str = '2010-07-01',\n",
+ "                 end_to_date : Optional[str] = None,\n",
+ "                 effort : float = 0.5,\n",
+ "                 time_stamps : int = 30):\n",
+ "        \"\"\"Derive the training settings from effort and initialize the parent class.\n",
+ "\n",
+ "        Args:\n",
+ "            code: Yahoo Finance ticker symbol.\n",
+ "            start_from_date: first day of the history, ISO format.\n",
+ "            end_to_date: last day of the history, ISO format; None (default) means today,\n",
+ "                resolved when the object is created.\n",
+ "            effort: value between 0 and 1 (clipped when outside) from which the training ratio,\n",
+ "                the number of epochs and the number of layers are derived.\n",
+ "            time_stamps: length of the input window fed to the network.\n",
+ "        \"\"\"\n",
+ "        #a default computed in the signature would stay frozen at the day the class was defined\n",
+ "        if end_to_date is None:\n",
+ "            end_to_date = datetime.date.today().isoformat()\n",
+ "\n",
+ "        #the effort parameter increases the resulting performances while increasing the computational time, it is suggested to leave it as default\n",
+ "        #clipped to [0, 1], otherwise the limits stated below would not hold\n",
+ "        effort = min(1.0, max(0.0, effort))\n",
+ "\n",
+ "        training_to_test_ratio = round((0.9-0.70)*effort+0.70, 2) #I want a minimum of 0.70 and a maximum of 0.9\n",
+ "        idx_epoch = 0 if effort <= 0.6 else 1 if effort <= 0.75 else 2 if effort <= 0.85 else 3\n",
+ "        n_epochs = [2, 5, 10, 15][idx_epoch] #I generally prefer 2 to be quicker\n",
+ "        n_layers = round(4 * effort) #more than 4 layers is not worth the effort\n",
+ "\n",
+ "        super().__init__(code = code,\n",
+ "                         start_from_date = start_from_date, end_to_date = end_to_date,\n",
+ "                         time_stamps = time_stamps,\n",
+ "                         training_to_test_ratio = training_to_test_ratio,\n",
+ "                         n_layers = n_layers, n_epochs = n_epochs)\n",
+ "\n",
+ "    def get_predictions(self, days_to_predict : int = 1,\n",
+ "                        predict_from_date : Optional[str] = None) -> list:\n",
+ "        \"\"\"Predict the price of each of the next days_to_predict days.\n",
+ "\n",
+ "        For every horizon from 1 to days_to_predict the model is fitted again with\n",
+ "        fit_and_test() and then asked for a prediction on the same input sequence.\n",
+ "\n",
+ "        Args:\n",
+ "            days_to_predict: number of days to forecast, raised to 1 when lower.\n",
+ "            predict_from_date: date (ISO format, present in the data) from which to forecast,\n",
+ "                the time_stamps records before it are the input; when None the test set is used.\n",
+ "\n",
+ "        Returns:\n",
+ "            The list of the predicted prices, one for each day.\n",
+ "\n",
+ "        Raises:\n",
+ "            ValueError: when predict_from_date is not before the end date, is not a date of\n",
+ "                the data, or has fewer than time_stamps records before it.\n",
+ "        \"\"\"\n",
+ "        if predict_from_date is None:\n",
+ "            input_sequence = self.df.iloc[self.split_val:]['Open'].values\n",
+ "        else:\n",
+ "            end_date = datetime.datetime.fromisoformat(self.end_to_date)\n",
+ "            start_date = datetime.datetime.fromisoformat(predict_from_date)\n",
+ "\n",
+ "            if end_date <= start_date:\n",
+ "                raise ValueError(\"The chosen 'predict_from_date' must be antecedent to the end date selected previously!\")\n",
+ "\n",
+ "            try:\n",
+ "                position = self.df['Date'].tolist().index(predict_from_date)\n",
+ "            except ValueError:\n",
+ "                raise ValueError(f\"The date {predict_from_date} is not among the dates of the downloaded data\") from None\n",
+ "\n",
+ "            #checked before the training, a negative start of the slice would select the wrong records\n",
+ "            if position < self.time_stamps:\n",
+ "                raise ValueError(f\"At least {self.time_stamps} records are needed before 'predict_from_date', only {position} are available\")\n",
+ "\n",
+ "            input_sequence = self.df.iloc[position-self.time_stamps:position]['Open'].values\n",
+ "\n",
+ "        self.days_to_predict = max(1, days_to_predict)\n",
+ "        predictions = []\n",
+ "        self.stored_models = []\n",
+ "\n",
+ "        for n in range(1, self.days_to_predict+1):\n",
+ "            self.fit_and_test(days_forward = n)\n",
+ "            #NOTE(review): this stores the same object at every iteration, not a separate model for each horizon - confirm the intent\n",
+ "            self.stored_models.append(self)\n",
+ "            price_predicted = self.predict(input_sequence = input_sequence, return_info = False)\n",
+ "            predictions.append(price_predicted)\n",
+ "\n",
+ "        return predictions\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ftsemib = Price_Predictor('FTSEMIB.MI')\n",
+ "ftsemib.plot_data()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 350
+ },
+ "id": "0hpp7rWo0GVL",
+ "outputId": "f1d5c59d-7984-4a0b-f8c2-3d27e8c31f04"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ftsemib.df"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 424
+ },
+ "id": "dyyPJ3eoyanQ",
+ "outputId": "46375a18-8d32-4f71-a838-e3e1213d5d56"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Date Open High Low Close Adj Close Volume\n",
+ "0 2010-07-01 19000.0 19254.0 18807.0 18944.0 18944.0 1.003573e+09\n",
+ "1 2010-07-02 19114.0 19257.0 18927.0 19074.0 19074.0 7.714960e+08\n",
+ "2 2010-07-05 19127.0 19127.0 18843.0 18849.0 18849.0 4.103019e+08\n",
+ "3 2010-07-06 18945.0 19577.0 18904.0 19357.0 19357.0 8.490529e+08\n",
+ "4 2010-07-07 19174.0 20013.0 19095.0 20013.0 20013.0 1.102459e+09\n",
+ "... ... ... ... ... ... ... ...\n",
+ "3072 2022-08-04 22650.0 22880.0 22590.0 22646.0 22646.0 4.562569e+08\n",
+ "3073 2022-08-05 22677.0 22732.0 22500.0 22587.0 22587.0 4.019544e+08\n",
+ "3074 2022-08-08 22722.0 22828.0 22560.0 22728.0 22728.0 2.927303e+08\n",
+ "3075 2022-08-09 22728.0 22771.0 22470.0 22488.0 22488.0 2.886964e+08\n",
+ "3076 2022-08-10 22419.0 22733.0 22354.0 22702.0 22702.0 2.790439e+08\n",
+ "\n",
+ "[3077 rows x 7 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Date | \n",
+ " Open | \n",
+ " High | \n",
+ " Low | \n",
+ " Close | \n",
+ " Adj Close | \n",
+ " Volume | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2010-07-01 | \n",
+ " 19000.0 | \n",
+ " 19254.0 | \n",
+ " 18807.0 | \n",
+ " 18944.0 | \n",
+ " 18944.0 | \n",
+ " 1.003573e+09 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2010-07-02 | \n",
+ " 19114.0 | \n",
+ " 19257.0 | \n",
+ " 18927.0 | \n",
+ " 19074.0 | \n",
+ " 19074.0 | \n",
+ " 7.714960e+08 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2010-07-05 | \n",
+ " 19127.0 | \n",
+ " 19127.0 | \n",
+ " 18843.0 | \n",
+ " 18849.0 | \n",
+ " 18849.0 | \n",
+ " 4.103019e+08 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2010-07-06 | \n",
+ " 18945.0 | \n",
+ " 19577.0 | \n",
+ " 18904.0 | \n",
+ " 19357.0 | \n",
+ " 19357.0 | \n",
+ " 8.490529e+08 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2010-07-07 | \n",
+ " 19174.0 | \n",
+ " 20013.0 | \n",
+ " 19095.0 | \n",
+ " 20013.0 | \n",
+ " 20013.0 | \n",
+ " 1.102459e+09 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 3072 | \n",
+ " 2022-08-04 | \n",
+ " 22650.0 | \n",
+ " 22880.0 | \n",
+ " 22590.0 | \n",
+ " 22646.0 | \n",
+ " 22646.0 | \n",
+ " 4.562569e+08 | \n",
+ "
\n",
+ " \n",
+ " 3073 | \n",
+ " 2022-08-05 | \n",
+ " 22677.0 | \n",
+ " 22732.0 | \n",
+ " 22500.0 | \n",
+ " 22587.0 | \n",
+ " 22587.0 | \n",
+ " 4.019544e+08 | \n",
+ "
\n",
+ " \n",
+ " 3074 | \n",
+ " 2022-08-08 | \n",
+ " 22722.0 | \n",
+ " 22828.0 | \n",
+ " 22560.0 | \n",
+ " 22728.0 | \n",
+ " 22728.0 | \n",
+ " 2.927303e+08 | \n",
+ "
\n",
+ " \n",
+ " 3075 | \n",
+ " 2022-08-09 | \n",
+ " 22728.0 | \n",
+ " 22771.0 | \n",
+ " 22470.0 | \n",
+ " 22488.0 | \n",
+ " 22488.0 | \n",
+ " 2.886964e+08 | \n",
+ "
\n",
+ " \n",
+ " 3076 | \n",
+ " 2022-08-10 | \n",
+ " 22419.0 | \n",
+ " 22733.0 | \n",
+ " 22354.0 | \n",
+ " 22702.0 | \n",
+ " 22702.0 | \n",
+ " 2.790439e+08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3077 rows × 7 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ftsemib.__get_training_set__()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 424
+ },
+ "id": "iTfjwNI9vmY_",
+ "outputId": "8025af8e-a6ff-44e0-c54a-1add644589aa"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Date Open High Low Close Adj Close Volume\n",
+ "0 2010-07-01 19000.0 19254.0 18807.0 18944.0 18944.0 1.003573e+09\n",
+ "1 2010-07-02 19114.0 19257.0 18927.0 19074.0 19074.0 7.714960e+08\n",
+ "2 2010-07-05 19127.0 19127.0 18843.0 18849.0 18849.0 4.103019e+08\n",
+ "3 2010-07-06 18945.0 19577.0 18904.0 19357.0 19357.0 8.490529e+08\n",
+ "4 2010-07-07 19174.0 20013.0 19095.0 20013.0 20013.0 1.102459e+09\n",
+ "... ... ... ... ... ... ... ...\n",
+ "2148 2018-12-11 18553.0 18768.0 18422.0 18591.0 18591.0 3.839444e+08\n",
+ "2149 2018-12-12 18667.0 19021.0 18627.0 18946.0 18946.0 4.417947e+08\n",
+ "2150 2018-12-13 19094.0 19210.0 18956.0 19049.0 19049.0 4.399755e+08\n",
+ "2151 2018-12-14 18875.0 18968.0 18734.0 18911.0 18911.0 3.119417e+08\n",
+ "2152 2018-12-17 18901.0 18914.0 18655.0 18693.0 18693.0 3.203914e+08\n",
+ "\n",
+ "[2153 rows x 7 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Date | \n",
+ " Open | \n",
+ " High | \n",
+ " Low | \n",
+ " Close | \n",
+ " Adj Close | \n",
+ " Volume | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2010-07-01 | \n",
+ " 19000.0 | \n",
+ " 19254.0 | \n",
+ " 18807.0 | \n",
+ " 18944.0 | \n",
+ " 18944.0 | \n",
+ " 1.003573e+09 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2010-07-02 | \n",
+ " 19114.0 | \n",
+ " 19257.0 | \n",
+ " 18927.0 | \n",
+ " 19074.0 | \n",
+ " 19074.0 | \n",
+ " 7.714960e+08 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2010-07-05 | \n",
+ " 19127.0 | \n",
+ " 19127.0 | \n",
+ " 18843.0 | \n",
+ " 18849.0 | \n",
+ " 18849.0 | \n",
+ " 4.103019e+08 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2010-07-06 | \n",
+ " 18945.0 | \n",
+ " 19577.0 | \n",
+ " 18904.0 | \n",
+ " 19357.0 | \n",
+ " 19357.0 | \n",
+ " 8.490529e+08 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2010-07-07 | \n",
+ " 19174.0 | \n",
+ " 20013.0 | \n",
+ " 19095.0 | \n",
+ " 20013.0 | \n",
+ " 20013.0 | \n",
+ " 1.102459e+09 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2148 | \n",
+ " 2018-12-11 | \n",
+ " 18553.0 | \n",
+ " 18768.0 | \n",
+ " 18422.0 | \n",
+ " 18591.0 | \n",
+ " 18591.0 | \n",
+ " 3.839444e+08 | \n",
+ "
\n",
+ " \n",
+ " 2149 | \n",
+ " 2018-12-12 | \n",
+ " 18667.0 | \n",
+ " 19021.0 | \n",
+ " 18627.0 | \n",
+ " 18946.0 | \n",
+ " 18946.0 | \n",
+ " 4.417947e+08 | \n",
+ "
\n",
+ " \n",
+ " 2150 | \n",
+ " 2018-12-13 | \n",
+ " 19094.0 | \n",
+ " 19210.0 | \n",
+ " 18956.0 | \n",
+ " 19049.0 | \n",
+ " 19049.0 | \n",
+ " 4.399755e+08 | \n",
+ "
\n",
+ " \n",
+ " 2151 | \n",
+ " 2018-12-14 | \n",
+ " 18875.0 | \n",
+ " 18968.0 | \n",
+ " 18734.0 | \n",
+ " 18911.0 | \n",
+ " 18911.0 | \n",
+ " 3.119417e+08 | \n",
+ "
\n",
+ " \n",
+ " 2152 | \n",
+ " 2018-12-17 | \n",
+ " 18901.0 | \n",
+ " 18914.0 | \n",
+ " 18655.0 | \n",
+ " 18693.0 | \n",
+ " 18693.0 | \n",
+ " 3.203914e+08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2153 rows × 7 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ftsemib.fit_and_test(days_forward = 1)\n",
+ "ftsemib.plot_results()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 350
+ },
+ "id": "7dy3SzIsgOhz",
+ "outputId": "9b39fcb9-b041-4638-d045-1fad8d24411f"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "