In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm import tqdm
import tensorflow.keras.backend as K
import os
import time
import pandas as pd
import numpy as np
import psutil
# Ignore harmless warnings
# NOTE(review): filterwarnings("ignore") with no category silences *every*
# warning for the rest of the session, not only harmless ones. That includes
# Deprecation/FutureWarnings raised by the pandas and h5py calls used further
# down, so API removals go unnoticed until they fail outright. Consider
# narrowing this to the specific categories/modules that are known noise.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Choose the compute device for this session: when TensorFlow can see GPUs,
# enable on-demand memory growth for each of them; otherwise report the CPU
# fallback and switch on the optimizer's JIT flag.
gpu_devices = tf.config.list_physical_devices('GPU')

if not gpu_devices:
    print('Using CPU')
    tf.config.optimizer.set_jit(True)
else:
    print('Using GPU')
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)

Using CPU


In [3]:
# Report host memory pressure before loading data.
# psutil is already imported in the first cell, so it is not re-imported here.
# Take a single snapshot so both figures describe the same instant (the
# previous version queried virtual_memory() twice).
vm_snapshot = psutil.virtual_memory()
print('used: {}% free: {:.2f}GB'.format(vm_snapshot.percent, float(vm_snapshot.free)/1024**3))

used: 91.7% free: 0.62GB


In [4]:
from pathlib import Path
# Relative path (resolved against the working directory) of an HDF5 file.
# NOTE(review): not referenced by any cell visible in this part of the
# notebook; presumably the local store for prepared model data, confirm.
DATA_STORE = Path('model_data.h5')

In [None]:
# Get News headlines
def get_news():
    """Load the news headlines of every ticker from the S3-hosted HDF5 store.

    Reads ``charlanguagemodeldata/news_db.h5``; each top-level group is read
    through its ``table`` dataset, from which the fields ``versionCreated``,
    ``text`` and ``ticker`` are kept (renamed to ``time`` / ``headline``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``time`` (ascending, truncated to whole seconds) with the
        columns ``ticker`` and ``headline``. Every distinct headline text
        appears once; the earliest occurrence is the one retained.

    Notes
    -----
    * SECURITY: an AWS access key pair used to be hard-coded here. It has been
      removed from the code, but it remains in version history and must be
      treated as compromised: revoke/rotate it. Credentials are now resolved
      by s3fs/botocore's default chain (e.g. ``AWS_ACCESS_KEY_ID`` /
      ``AWS_SECRET_ACCESS_KEY`` environment variables, the shared credentials
      file, or an instance role).
    * NOTE(review): de-duplication is on headline text alone, so a headline
      tagged to several tickers survives for only one of them. The original
      comment said "drop identical(ticker-datetime)", which this does not do.
      Behaviour is kept as-is; confirm which one is intended.
    """
    import h5py
    import s3fs

    s3 = s3fs.S3FileSystem(anon=False,
                           client_kwargs={'region_name': 'us-west-2'})

    frames = []
    # Both handles are context-managed so the remote file is closed as well as
    # the HDF5 wrapper around it.
    with s3.open("charlanguagemodeldata/news_db.h5", 'rb') as remote:
        # NOTE(review): `lib_version` is not a documented h5py.File keyword
        # (the documented one is `libver`); kept unchanged, confirm it is needed.
        with h5py.File(remote, 'r', lib_version='latest') as f:
            for ticker in tqdm(list(f.keys())):
                # Dataset.value was removed in h5py 3.0; indexing with ()
                # reads the whole dataset into a NumPy array on any version.
                temp_df = pd.DataFrame(f[ticker + '/table'][()])
                temp_df = temp_df[['versionCreated', 'text', 'ticker']]
                temp_df = temp_df.rename(columns={'versionCreated': 'time',
                                                  'text': 'headline'})
                # The object columns are decoded from UTF-8 bytes to str.
                text_cols = temp_df.select_dtypes([object]).columns
                if len(text_cols):
                    temp_df[text_cols] = temp_df[text_cols].apply(
                        lambda col: col.str.decode('utf-8'))
                temp_df['time'] = pd.to_datetime(temp_df['time'], unit='ns')
                # Collect and concatenate once after the loop: appending to a
                # growing frame copies it on every iteration, and
                # DataFrame.append no longer exists in pandas 2.x.
                frames.append(temp_df.set_index('time'))

    if not frames:
        # Empty store: return an empty frame with the usual layout instead of
        # failing further down.
        return pd.DataFrame(columns=['ticker', 'headline'],
                            index=pd.DatetimeIndex([], name='time'))

    # Stable sort so that, among equal timestamps, the first-read row wins the
    # de-duplication below deterministically.
    data = pd.concat(frames).sort_index(kind='mergesort')
    data = data[~data.headline.duplicated()]
    data = data[['ticker', 'headline']]
    # Truncate to whole seconds (previously done via a strftime/to_datetime
    # round trip over the entire index).
    data.index = data.index.floor('s')
    return data

def get_prices(interval):
    """Load Open/Close prices at one sampling interval from the S3 HDF5 store.

    Reads ``prices/<interval>/table`` from ``charlanguagemodeldata/universe.h5``
    and keeps the fields ``time``, ``ticker``, ``Open`` and ``Close``.

    Parameters
    ----------
    interval : str
        Name of a group under ``prices`` in the file. The available names are
        printed on every call; a past run of this notebook listed
        ``15_min``, ``daily`` and ``hourly``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(ticker, time)``, sorted, restricted to 2019 onwards,
        with duplicate index entries removed (first one kept) and the columns
        ``Open`` and ``Close``.

    Notes
    -----
    SECURITY: an AWS access key pair used to be hard-coded here. It has been
    removed from the code, but it remains in version history and must be
    treated as compromised: revoke/rotate it. Credentials are now resolved by
    s3fs/botocore's default chain (e.g. ``AWS_ACCESS_KEY_ID`` /
    ``AWS_SECRET_ACCESS_KEY`` environment variables, the shared credentials
    file, or an instance role).
    """
    import h5py
    import s3fs

    idx = pd.IndexSlice
    s3 = s3fs.S3FileSystem(anon=False,
                           client_kwargs={'region_name': 'us-west-2'})

    get_columns = ['time', 'ticker', 'Open', 'Close']
    # Both handles are context-managed so the remote file is closed as well as
    # the HDF5 wrapper around it.
    with s3.open("charlanguagemodeldata/universe.h5", 'rb') as remote:
        # NOTE(review): `lib_version` is not a documented h5py.File keyword
        # (the documented one is `libver`); kept unchanged, confirm it is needed.
        with h5py.File(remote, 'r', lib_version='latest') as f:
            print(list(f.keys()))
            print(list(f.get('prices').keys()))
            # Dataset.value was removed in h5py 3.0; indexing with () reads
            # the whole dataset into memory, so it stays usable after close.
            raw_table = f['prices/' + interval + '/table'][()]

    data = pd.DataFrame(raw_table, columns=get_columns)
    # The object columns are decoded from UTF-8 bytes to str.
    text_cols = data.select_dtypes([object]).columns
    if len(text_cols):
        data[text_cols] = data[text_cols].apply(
            lambda col: col.str.decode('utf-8'))
    data['time'] = pd.to_datetime(data['time'], unit='ns')
    data = data.set_index(['ticker', 'time'])
    data = (data.sort_index(level=0, sort_remaining=False)
                .loc[idx[:, '2019':], :]
                .sort_index())
    # The scrape may have stored the same (ticker, time) row more than once.
    data = data[~data.index.duplicated()]
    return data

In [6]:
news = get_news()
# Preview the first and last five rows. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same result on every pandas version.
pd.concat([news.head(), news.tail()])

  0%|          | 1/532 [00:06<58:06,  6.57s/it]




  0%|          | 2/532 [00:17<1:09:03,  7.82s/it]




  1%|          | 3/532 [00:21<59:21,  6.73s/it]  




  1%|          | 4/532 [00:37<1:23:04,  9.44s/it]




  1%|          | 5/532 [00:59<1:56:10, 13.23s/it]




  1%|          | 6/532 [01:05<1:36:14, 10.98s/it]




  1%|▏         | 7/532 [01:06<1:10:13,  8.03s/it]




  2%|▏         | 8/532 [01:14<1:09:36,  7.97s/it]




  2%|▏         | 9/532 [01:21<1:08:42,  7.88s/it]




  2%|▏         | 10/532 [01:25<59:03,  6.79s/it] 




  2%|▏         | 11/532 [01:30<52:47,  6.08s/it]




  2%|▏         | 12/532 [01:35<49:49,  5.75s/it]




  2%|▏         | 13/532 [01:39<45:09,  5.22s/it]




  3%|▎         | 14/532 [01:43<42:05,  4.87s/it]




  3%|▎         | 15/532 [01:47<39:14,  4.55s/it]




  3%|▎         | 16/532 [01:53<43:24,  5.05s/it]




  3%|▎         | 17/532 [02:00<47:24,  5.52s/it]




  3%|▎         | 18/532 [02:04<44:41,  5.22s/it]




  4%|▎         | 19/532 [02:13<54:05,  6.33s/it]




  4%|▍         | 20/532 [02:17<47:15,  5.54s/it]




  4%|▍         | 21/532 [02:19<38:17,  4.50s/it]




  4%|▍         | 22/532 [02:24<39:00,  4.59s/it]




  4%|▍         | 23/532 [02:31<46:44,  5.51s/it]




  5%|▍         | 24/532 [02:48<1:16:17,  9.01s/it]




  5%|▍         | 25/532 [02:53<1:04:26,  7.63s/it]




  5%|▍         | 26/532 [03:00<1:03:33,  7.54s/it]




  5%|▌         | 27/532 [03:05<56:19,  6.69s/it]  




  5%|▌         | 28/532 [03:07<44:48,  5.33s/it]




  5%|▌         | 29/532 [03:11<41:46,  4.98s/it]




  6%|▌         | 30/532 [03:15<38:32,  4.61s/it]




  6%|▌         | 31/532 [03:19<36:42,  4.40s/it]




  6%|▌         | 32/532 [03:25<40:31,  4.86s/it]




  6%|▌         | 33/532 [03:28<35:27,  4.26s/it]




  6%|▋         | 34/532 [03:35<43:01,  5.18s/it]




  7%|▋         | 35/532 [03:39<40:37,  4.90s/it]




  7%|▋         | 36/532 [03:43<37:53,  4.58s/it]




  7%|▋         | 37/532 [03:59<1:04:53,  7.86s/it]




  7%|▋         | 38/532 [04:03<55:13,  6.71s/it]  




  7%|▋         | 39/532 [04:06<47:04,  5.73s/it]




  8%|▊         | 40/532 [04:11<45:00,  5.49s/it]




  8%|▊         | 41/532 [04:18<49:07,  6.00s/it]




  8%|▊         | 42/532 [04:23<45:21,  5.55s/it]




  8%|▊         | 43/532 [04:29<48:04,  5.90s/it]




  8%|▊         | 44/532 [04:35<48:24,  5.95s/it]




  8%|▊         | 45/532 [04:39<42:24,  5.22s/it]




  9%|▊         | 46/532 [04:43<40:34,  5.01s/it]




  9%|▉         | 47/532 [04:47<36:52,  4.56s/it]




  9%|▉         | 48/532 [04:52<37:13,  4.62s/it]




  9%|▉         | 49/532 [04:55<33:00,  4.10s/it]




  9%|▉         | 50/532 [05:00<35:40,  4.44s/it]




 10%|▉         | 51/532 [05:02<31:19,  3.91s/it]




 10%|▉         | 52/532 [05:09<36:50,  4.61s/it]




 10%|▉         | 53/532 [05:12<32:29,  4.07s/it]




 10%|█         | 54/532 [05:16<34:04,  4.28s/it]




 10%|█         | 55/532 [05:22<38:10,  4.80s/it]




 11%|█         | 56/532 [05:23<28:53,  3.64s/it]




 11%|█         | 57/532 [05:37<52:32,  6.64s/it]




 11%|█         | 58/532 [05:47<1:00:14,  7.63s/it]




 11%|█         | 59/532 [05:52<54:42,  6.94s/it]  




 11%|█▏        | 60/532 [05:59<53:43,  6.83s/it]




 11%|█▏        | 61/532 [06:05<51:47,  6.60s/it]




 12%|█▏        | 62/532 [06:13<55:27,  7.08s/it]




 12%|█▏        | 63/532 [06:16<44:44,  5.72s/it]




 12%|█▏        | 64/532 [06:22<46:23,  5.95s/it]




 12%|█▏        | 65/532 [06:28<46:22,  5.96s/it]




 12%|█▏        | 66/532 [06:30<38:03,  4.90s/it]




 13%|█▎        | 67/532 [06:41<51:52,  6.69s/it]




 13%|█▎        | 68/532 [06:46<47:26,  6.13s/it]




 13%|█▎        | 69/532 [06:55<53:36,  6.95s/it]




 13%|█▎        | 70/532 [07:07<1:04:47,  8.42s/it]




 13%|█▎        | 71/532 [07:11<55:30,  7.22s/it]  




 14%|█▎        | 72/532 [07:16<48:31,  6.33s/it]




 14%|█▎        | 73/532 [07:23<50:50,  6.65s/it]




 14%|█▍        | 74/532 [07:29<50:08,  6.57s/it]




 14%|█▍        | 75/532 [07:37<52:16,  6.86s/it]




 14%|█▍        | 76/532 [07:43<50:01,  6.58s/it]




 14%|█▍        | 77/532 [07:47<45:28,  6.00s/it]




 15%|█▍        | 78/532 [07:52<43:02,  5.69s/it]




 15%|█▍        | 79/532 [08:03<53:52,  7.14s/it]




 15%|█▌        | 80/532 [08:08<50:18,  6.68s/it]




 15%|█▌        | 81/532 [08:14<46:33,  6.19s/it]




 15%|█▌        | 82/532 [08:15<36:09,  4.82s/it]




 16%|█▌        | 83/532 [08:24<44:17,  5.92s/it]




 16%|█▌        | 84/532 [08:29<41:57,  5.62s/it]




 16%|█▌        | 85/532 [08:36<46:51,  6.29s/it]




 16%|█▌        | 86/532 [08:42<44:42,  6.01s/it]




 16%|█▋        | 87/532 [08:47<41:48,  5.64s/it]




 17%|█▋        | 88/532 [08:55<48:17,  6.53s/it]




 17%|█▋        | 89/532 [08:59<43:02,  5.83s/it]




 17%|█▋        | 90/532 [09:02<36:37,  4.97s/it]




 17%|█▋        | 91/532 [09:06<34:35,  4.71s/it]




 17%|█▋        | 92/532 [09:10<31:03,  4.24s/it]




 17%|█▋        | 93/532 [09:13<28:43,  3.93s/it]




 18%|█▊        | 94/532 [09:18<31:13,  4.28s/it]




 18%|█▊        | 95/532 [09:23<32:16,  4.43s/it]




 18%|█▊        | 96/532 [09:26<29:34,  4.07s/it]




 18%|█▊        | 97/532 [09:28<26:06,  3.60s/it]




 18%|█▊        | 98/532 [09:33<27:38,  3.82s/it]




 19%|█▊        | 99/532 [09:38<31:22,  4.35s/it]




 19%|█▉        | 100/532 [09:41<27:18,  3.79s/it]




 19%|█▉        | 101/532 [09:46<29:46,  4.14s/it]




 19%|█▉        | 102/532 [09:50<30:07,  4.20s/it]




 19%|█▉        | 103/532 [09:55<31:39,  4.43s/it]




 20%|█▉        | 104/532 [10:03<40:09,  5.63s/it]




 20%|█▉        | 105/532 [10:09<40:23,  5.67s/it]




 20%|█▉        | 106/532 [10:15<39:45,  5.60s/it]




 20%|██        | 107/532 [10:23<45:25,  6.41s/it]




 20%|██        | 108/532 [10:28<41:50,  5.92s/it]




 20%|██        | 109/532 [10:34<42:37,  6.05s/it]




 21%|██        | 110/532 [10:41<43:41,  6.21s/it]




 21%|██        | 111/532 [10:47<44:16,  6.31s/it]




 21%|██        | 112/532 [10:52<41:37,  5.95s/it]




 21%|██        | 113/532 [10:53<30:41,  4.39s/it]




 21%|██▏       | 114/532 [11:02<39:53,  5.73s/it]




 22%|██▏       | 115/532 [11:09<42:32,  6.12s/it]




 22%|██▏       | 116/532 [11:10<32:31,  4.69s/it]




 22%|██▏       | 117/532 [11:15<32:37,  4.72s/it]




 22%|██▏       | 118/532 [11:16<23:34,  3.42s/it]




 22%|██▏       | 119/532 [11:24<33:38,  4.89s/it]




 23%|██▎       | 120/532 [11:35<45:58,  6.70s/it]




 23%|██▎       | 121/532 [11:39<40:29,  5.91s/it]




 23%|██▎       | 122/532 [11:42<34:12,  5.01s/it]




 23%|██▎       | 123/532 [11:43<25:55,  3.80s/it]




 23%|██▎       | 124/532 [11:47<27:08,  3.99s/it]




 23%|██▎       | 125/532 [11:54<33:26,  4.93s/it]




 24%|██▎       | 126/532 [11:59<33:21,  4.93s/it]




 24%|██▍       | 127/532 [12:04<32:59,  4.89s/it]




 24%|██▍       | 128/532 [12:14<42:24,  6.30s/it]




 24%|██▍       | 129/532 [12:24<49:46,  7.41s/it]




 24%|██▍       | 130/532 [12:28<42:57,  6.41s/it]




 25%|██▍       | 131/532 [12:36<46:47,  7.00s/it]




 25%|██▍       | 132/532 [12:47<54:01,  8.10s/it]




 25%|██▌       | 133/532 [12:53<50:07,  7.54s/it]




 25%|██▌       | 134/532 [12:58<45:18,  6.83s/it]




 25%|██▌       | 135/532 [13:02<40:15,  6.09s/it]




 26%|██▌       | 136/532 [13:07<36:17,  5.50s/it]




 26%|██▌       | 137/532 [13:12<36:30,  5.54s/it]




 26%|██▌       | 138/532 [13:17<35:34,  5.42s/it]




 26%|██▌       | 139/532 [13:24<38:49,  5.93s/it]




 26%|██▋       | 140/532 [13:34<46:20,  7.09s/it]




 27%|██▋       | 141/532 [13:38<39:40,  6.09s/it]




 27%|██▋       | 142/532 [13:42<35:33,  5.47s/it]




 27%|██▋       | 143/532 [13:48<36:44,  5.67s/it]




 27%|██▋       | 144/532 [13:53<35:55,  5.55s/it]




 27%|██▋       | 145/532 [13:58<34:46,  5.39s/it]




 27%|██▋       | 146/532 [14:02<31:08,  4.84s/it]




 28%|██▊       | 147/532 [14:07<30:32,  4.76s/it]




 28%|██▊       | 148/532 [14:14<35:34,  5.56s/it]




 28%|██▊       | 149/532 [14:21<38:08,  5.98s/it]




 28%|██▊       | 150/532 [14:25<33:34,  5.27s/it]




 28%|██▊       | 151/532 [14:29<31:36,  4.98s/it]




 29%|██▊       | 152/532 [14:34<32:30,  5.13s/it]




 29%|██▉       | 153/532 [14:42<37:09,  5.88s/it]




 29%|██▉       | 154/532 [14:47<35:00,  5.56s/it]




 29%|██▉       | 155/532 [14:53<36:40,  5.84s/it]




 29%|██▉       | 156/532 [14:59<36:09,  5.77s/it]




 30%|██▉       | 157/532 [15:03<31:58,  5.12s/it]




 30%|██▉       | 158/532 [15:07<30:51,  4.95s/it]




 30%|██▉       | 159/532 [15:15<35:30,  5.71s/it]




 30%|███       | 160/532 [15:20<34:31,  5.57s/it]




 30%|███       | 161/532 [15:24<32:40,  5.28s/it]




 30%|███       | 162/532 [15:29<30:48,  5.00s/it]




 31%|███       | 163/532 [15:33<28:48,  4.68s/it]




 31%|███       | 164/532 [15:37<28:31,  4.65s/it]




 31%|███       | 165/532 [15:41<26:43,  4.37s/it]




 31%|███       | 166/532 [15:45<26:29,  4.34s/it]




 31%|███▏      | 167/532 [15:50<26:45,  4.40s/it]




 32%|███▏      | 168/532 [15:55<27:37,  4.55s/it]




 32%|███▏      | 169/532 [15:57<24:12,  4.00s/it]




 32%|███▏      | 170/532 [16:01<22:31,  3.73s/it]




 32%|███▏      | 171/532 [16:03<20:29,  3.41s/it]




 32%|███▏      | 172/532 [16:06<19:53,  3.32s/it]




 33%|███▎      | 173/532 [16:10<20:57,  3.50s/it]




 33%|███▎      | 174/532 [16:17<26:14,  4.40s/it]




 33%|███▎      | 175/532 [16:19<22:23,  3.76s/it]




 33%|███▎      | 176/532 [16:24<25:05,  4.23s/it]




 33%|███▎      | 177/532 [16:29<26:18,  4.45s/it]




 33%|███▎      | 178/532 [16:36<29:26,  4.99s/it]




 34%|███▎      | 179/532 [16:36<21:13,  3.61s/it]




 34%|███▍      | 180/532 [16:42<24:59,  4.26s/it]




 34%|███▍      | 181/532 [16:44<21:20,  3.65s/it]




 34%|███▍      | 182/532 [16:55<33:45,  5.79s/it]




 34%|███▍      | 183/532 [17:00<32:36,  5.60s/it]




 35%|███▍      | 184/532 [17:02<26:18,  4.54s/it]




 35%|███▍      | 185/532 [17:15<40:14,  6.96s/it]




 35%|███▍      | 186/532 [17:18<33:53,  5.88s/it]




 35%|███▌      | 187/532 [17:25<35:29,  6.17s/it]




 35%|███▌      | 188/532 [17:33<38:56,  6.79s/it]




 36%|███▌      | 189/532 [17:38<36:32,  6.39s/it]




 36%|███▌      | 190/532 [17:43<32:37,  5.72s/it]




 36%|███▌      | 191/532 [17:48<32:34,  5.73s/it]




 36%|███▌      | 192/532 [17:54<31:38,  5.58s/it]




 36%|███▋      | 193/532 [17:58<30:21,  5.37s/it]




 36%|███▋      | 194/532 [18:02<27:25,  4.87s/it]




 37%|███▋      | 195/532 [18:06<25:50,  4.60s/it]




 37%|███▋      | 196/532 [18:09<22:55,  4.09s/it]




 37%|███▋      | 197/532 [18:13<22:30,  4.03s/it]




 37%|███▋      | 198/532 [18:18<23:48,  4.28s/it]




 37%|███▋      | 199/532 [18:22<23:57,  4.32s/it]




 38%|███▊      | 200/532 [18:26<22:59,  4.15s/it]




 38%|███▊      | 201/532 [18:28<19:17,  3.50s/it]




 38%|███▊      | 202/532 [18:35<24:50,  4.52s/it]




 38%|███▊      | 203/532 [18:40<25:10,  4.59s/it]




 38%|███▊      | 204/532 [18:43<23:44,  4.34s/it]




 39%|███▊      | 205/532 [18:50<26:59,  4.95s/it]




 39%|███▊      | 206/532 [18:59<34:15,  6.31s/it]




 39%|███▉      | 207/532 [19:08<38:34,  7.12s/it]




 39%|███▉      | 208/532 [19:14<35:49,  6.64s/it]




 39%|███▉      | 209/532 [19:16<29:12,  5.43s/it]




 39%|███▉      | 210/532 [19:21<28:07,  5.24s/it]




 40%|███▉      | 211/532 [19:30<34:04,  6.37s/it]




 40%|███▉      | 212/532 [19:42<42:05,  7.89s/it]




 40%|████      | 213/532 [19:53<47:07,  8.86s/it]




 40%|████      | 214/532 [19:56<37:33,  7.09s/it]




 40%|████      | 215/532 [20:00<32:45,  6.20s/it]




 41%|████      | 216/532 [20:07<33:52,  6.43s/it]




 41%|████      | 217/532 [20:11<29:56,  5.70s/it]




 41%|████      | 218/532 [20:21<36:55,  7.06s/it]




 41%|████      | 219/532 [20:23<28:49,  5.53s/it]




 41%|████▏     | 220/532 [20:30<30:52,  5.94s/it]




 42%|████▏     | 221/532 [20:34<28:06,  5.42s/it]




 42%|████▏     | 222/532 [20:37<24:23,  4.72s/it]




 42%|████▏     | 223/532 [20:42<23:54,  4.64s/it]




 42%|████▏     | 224/532 [20:45<22:11,  4.32s/it]




 42%|████▏     | 225/532 [20:52<25:37,  5.01s/it]




 42%|████▏     | 226/532 [20:57<25:14,  4.95s/it]




 43%|████▎     | 227/532 [21:01<24:37,  4.84s/it]




 43%|████▎     | 228/532 [21:06<24:37,  4.86s/it]




 43%|████▎     | 229/532 [21:12<25:32,  5.06s/it]




 43%|████▎     | 230/532 [21:17<25:49,  5.13s/it]




 43%|████▎     | 231/532 [21:20<22:34,  4.50s/it]




 44%|████▎     | 232/532 [21:28<27:30,  5.50s/it]




 44%|████▍     | 233/532 [21:34<28:23,  5.70s/it]




 44%|████▍     | 234/532 [21:44<34:46,  7.00s/it]




 44%|████▍     | 235/532 [21:45<26:04,  5.27s/it]




 44%|████▍     | 236/532 [21:50<24:39,  5.00s/it]




 45%|████▍     | 237/532 [21:53<21:51,  4.45s/it]




 45%|████▍     | 238/532 [21:56<20:40,  4.22s/it]




 45%|████▍     | 239/532 [22:00<20:18,  4.16s/it]




 45%|████▌     | 240/532 [22:06<22:37,  4.65s/it]




 45%|████▌     | 241/532 [22:12<24:24,  5.03s/it]




 45%|████▌     | 242/532 [22:22<31:18,  6.48s/it]




 46%|████▌     | 243/532 [22:30<33:55,  7.04s/it]




 46%|████▌     | 244/532 [22:32<26:05,  5.43s/it]




 46%|████▌     | 245/532 [22:36<23:18,  4.87s/it]




 46%|████▌     | 246/532 [22:42<25:13,  5.29s/it]




 46%|████▋     | 247/532 [22:49<27:06,  5.71s/it]




 47%|████▋     | 248/532 [22:53<25:46,  5.44s/it]




 47%|████▋     | 249/532 [23:01<29:23,  6.23s/it]




 47%|████▋     | 250/532 [23:12<35:07,  7.48s/it]




 47%|████▋     | 251/532 [23:17<31:53,  6.81s/it]




 47%|████▋     | 252/532 [23:20<26:33,  5.69s/it]




 48%|████▊     | 253/532 [23:24<23:49,  5.12s/it]




 48%|████▊     | 254/532 [23:26<19:18,  4.17s/it]




 48%|████▊     | 255/532 [23:29<17:37,  3.82s/it]




 48%|████▊     | 256/532 [23:34<19:48,  4.31s/it]




 48%|████▊     | 257/532 [23:38<18:34,  4.05s/it]




 48%|████▊     | 258/532 [23:40<15:46,  3.46s/it]




 49%|████▊     | 259/532 [23:44<16:51,  3.71s/it]




 49%|████▉     | 260/532 [23:48<17:26,  3.85s/it]




 49%|████▉     | 261/532 [23:58<25:23,  5.62s/it]




 49%|████▉     | 262/532 [24:02<23:12,  5.16s/it]




 49%|████▉     | 263/532 [24:05<20:17,  4.53s/it]




 50%|████▉     | 264/532 [24:10<20:25,  4.57s/it]




 50%|████▉     | 265/532 [24:16<22:43,  5.11s/it]




 50%|█████     | 266/532 [24:19<19:35,  4.42s/it]




 50%|█████     | 267/532 [24:28<25:18,  5.73s/it]




 50%|█████     | 268/532 [24:33<24:03,  5.47s/it]




 51%|█████     | 269/532 [24:44<30:59,  7.07s/it]




 51%|█████     | 270/532 [24:49<29:24,  6.74s/it]




 51%|█████     | 271/532 [24:56<28:53,  6.64s/it]




 51%|█████     | 272/532 [25:02<28:05,  6.48s/it]




 51%|█████▏    | 273/532 [25:10<29:37,  6.86s/it]




 52%|█████▏    | 274/532 [25:14<26:32,  6.17s/it]




 52%|█████▏    | 275/532 [25:18<23:01,  5.38s/it]




 52%|█████▏    | 276/532 [25:23<22:23,  5.25s/it]




 52%|█████▏    | 277/532 [25:33<28:33,  6.72s/it]




 52%|█████▏    | 278/532 [25:34<20:47,  4.91s/it]




 52%|█████▏    | 279/532 [25:41<24:17,  5.76s/it]




 53%|█████▎    | 280/532 [25:48<25:53,  6.16s/it]




 53%|█████▎    | 281/532 [25:49<19:17,  4.61s/it]




 53%|█████▎    | 282/532 [25:53<18:28,  4.43s/it]




 53%|█████▎    | 283/532 [25:59<19:42,  4.75s/it]




 53%|█████▎    | 284/532 [26:05<20:39,  5.00s/it]




 54%|█████▎    | 285/532 [26:11<21:52,  5.31s/it]




 54%|█████▍    | 286/532 [26:16<22:23,  5.46s/it]




 54%|█████▍    | 287/532 [26:22<22:02,  5.40s/it]




 54%|█████▍    | 288/532 [26:26<20:47,  5.11s/it]




 54%|█████▍    | 289/532 [26:31<20:36,  5.09s/it]




 55%|█████▍    | 290/532 [26:36<19:58,  4.95s/it]




 55%|█████▍    | 291/532 [26:40<19:10,  4.77s/it]




 55%|█████▍    | 292/532 [26:48<23:09,  5.79s/it]




 55%|█████▌    | 293/532 [26:50<18:10,  4.56s/it]




 55%|█████▌    | 294/532 [26:58<22:25,  5.65s/it]




 55%|█████▌    | 295/532 [27:07<26:24,  6.68s/it]




 56%|█████▌    | 296/532 [27:13<25:08,  6.39s/it]




 56%|█████▌    | 297/532 [27:16<21:23,  5.46s/it]




 56%|█████▌    | 298/532 [27:22<22:02,  5.65s/it]




 56%|█████▌    | 299/532 [27:27<20:12,  5.21s/it]




 56%|█████▋    | 300/532 [27:31<19:32,  5.05s/it]




 57%|█████▋    | 301/532 [27:38<21:31,  5.59s/it]




 57%|█████▋    | 302/532 [27:46<24:20,  6.35s/it]




 57%|█████▋    | 303/532 [27:52<23:13,  6.08s/it]




 57%|█████▋    | 304/532 [27:54<19:15,  5.07s/it]




 57%|█████▋    | 305/532 [28:01<20:27,  5.41s/it]




 58%|█████▊    | 306/532 [28:05<19:20,  5.14s/it]




 58%|█████▊    | 307/532 [28:14<23:52,  6.37s/it]




 58%|█████▊    | 308/532 [28:17<19:17,  5.17s/it]




 58%|█████▊    | 309/532 [28:25<22:41,  6.10s/it]




 58%|█████▊    | 310/532 [28:29<20:25,  5.52s/it]




 58%|█████▊    | 311/532 [28:37<23:22,  6.35s/it]




 59%|█████▊    | 312/532 [28:43<22:30,  6.14s/it]




 59%|█████▉    | 313/532 [28:48<21:23,  5.86s/it]




 59%|█████▉    | 314/532 [29:00<27:33,  7.58s/it]




 59%|█████▉    | 315/532 [29:05<24:27,  6.76s/it]




 59%|█████▉    | 316/532 [29:14<26:36,  7.39s/it]




 60%|█████▉    | 317/532 [29:15<20:35,  5.75s/it]




 60%|█████▉    | 318/532 [29:23<22:29,  6.31s/it]




 60%|█████▉    | 319/532 [29:31<23:41,  6.67s/it]




 60%|██████    | 320/532 [29:33<18:44,  5.31s/it]




 60%|██████    | 321/532 [29:37<17:26,  4.96s/it]




 61%|██████    | 322/532 [29:40<15:14,  4.36s/it]




 61%|██████    | 323/532 [29:42<13:08,  3.77s/it]




 61%|██████    | 324/532 [29:47<14:17,  4.12s/it]




 61%|██████    | 325/532 [29:56<19:19,  5.60s/it]




 61%|██████▏   | 326/532 [29:59<16:13,  4.72s/it]




 61%|██████▏   | 327/532 [30:06<18:32,  5.43s/it]




 62%|██████▏   | 328/532 [30:11<17:36,  5.18s/it]




 62%|██████▏   | 329/532 [30:18<19:58,  5.90s/it]




 62%|██████▏   | 330/532 [30:27<22:46,  6.77s/it]




 62%|██████▏   | 331/532 [30:33<21:54,  6.54s/it]




 62%|██████▏   | 332/532 [30:38<19:58,  5.99s/it]




 63%|██████▎   | 333/532 [30:52<27:45,  8.37s/it]




 63%|██████▎   | 334/532 [31:00<28:07,  8.52s/it]




 63%|██████▎   | 335/532 [31:12<31:03,  9.46s/it]




 63%|██████▎   | 336/532 [31:19<28:36,  8.76s/it]




 63%|██████▎   | 337/532 [31:24<24:44,  7.61s/it]




 64%|██████▎   | 338/532 [31:28<21:04,  6.52s/it]




 64%|██████▎   | 339/532 [31:36<21:56,  6.82s/it]




 64%|██████▍   | 340/532 [31:39<18:44,  5.86s/it]




 64%|██████▍   | 341/532 [31:45<18:54,  5.94s/it]




 64%|██████▍   | 342/532 [31:51<18:20,  5.79s/it]




 64%|██████▍   | 343/532 [31:57<18:35,  5.90s/it]




 65%|██████▍   | 344/532 [32:06<21:29,  6.86s/it]




 65%|██████▍   | 345/532 [32:13<21:33,  6.92s/it]




 65%|██████▌   | 346/532 [32:22<23:14,  7.50s/it]




 65%|██████▌   | 347/532 [32:34<27:34,  8.94s/it]




 65%|██████▌   | 348/532 [32:38<22:33,  7.35s/it]




 66%|██████▌   | 349/532 [32:47<23:56,  7.85s/it]




 66%|██████▌   | 350/532 [32:51<20:18,  6.70s/it]




 66%|██████▌   | 351/532 [32:57<19:29,  6.46s/it]




 66%|██████▌   | 352/532 [33:05<20:56,  6.98s/it]




 66%|██████▋   | 353/532 [33:10<19:17,  6.46s/it]




 67%|██████▋   | 354/532 [33:19<21:05,  7.11s/it]




 67%|██████▋   | 355/532 [33:24<18:49,  6.38s/it]




 67%|██████▋   | 356/532 [33:29<17:27,  5.95s/it]




 67%|██████▋   | 357/532 [33:34<16:37,  5.70s/it]




 67%|██████▋   | 358/532 [33:37<14:17,  4.93s/it]




 67%|██████▋   | 359/532 [33:43<15:01,  5.21s/it]




 68%|██████▊   | 360/532 [33:47<14:27,  5.04s/it]




 68%|██████▊   | 361/532 [33:56<17:31,  6.15s/it]




 68%|██████▊   | 362/532 [33:57<12:51,  4.54s/it]




 68%|██████▊   | 363/532 [34:01<12:19,  4.37s/it]




 68%|██████▊   | 364/532 [34:09<15:03,  5.38s/it]




 69%|██████▊   | 365/532 [34:17<17:17,  6.21s/it]




 69%|██████▉   | 366/532 [34:22<15:58,  5.78s/it]




 69%|██████▉   | 367/532 [34:25<14:09,  5.15s/it]




 69%|██████▉   | 368/532 [34:27<11:02,  4.04s/it]




 69%|██████▉   | 369/532 [34:32<11:43,  4.32s/it]




 70%|██████▉   | 370/532 [34:36<11:22,  4.21s/it]




 70%|██████▉   | 371/532 [34:45<15:29,  5.77s/it]




 70%|██████▉   | 372/532 [34:49<13:46,  5.17s/it]




 70%|███████   | 373/532 [34:51<11:04,  4.18s/it]




 70%|███████   | 374/532 [34:59<14:22,  5.46s/it]




 70%|███████   | 375/532 [35:00<10:24,  3.98s/it]




 71%|███████   | 376/532 [35:03<09:31,  3.67s/it]




 71%|███████   | 377/532 [35:06<09:30,  3.68s/it]




 71%|███████   | 378/532 [35:10<09:39,  3.76s/it]




 71%|███████   | 379/532 [35:12<08:03,  3.16s/it]




 71%|███████▏  | 380/532 [35:16<08:28,  3.34s/it]




 72%|███████▏  | 381/532 [35:21<10:04,  4.00s/it]




 72%|███████▏  | 382/532 [35:30<13:20,  5.34s/it]




 72%|███████▏  | 383/532 [35:39<16:00,  6.45s/it]




 72%|███████▏  | 384/532 [35:42<13:21,  5.41s/it]




 72%|███████▏  | 385/532 [35:50<15:04,  6.15s/it]




 73%|███████▎  | 386/532 [35:55<14:10,  5.83s/it]




 73%|███████▎  | 387/532 [35:58<12:31,  5.18s/it]




 73%|███████▎  | 388/532 [36:03<11:44,  4.89s/it]




 73%|███████▎  | 389/532 [36:05<09:57,  4.18s/it]




 73%|███████▎  | 390/532 [36:08<09:17,  3.92s/it]




 73%|███████▎  | 391/532 [36:14<10:41,  4.55s/it]




 74%|███████▎  | 392/532 [36:21<11:58,  5.13s/it]




 74%|███████▍  | 393/532 [36:27<12:38,  5.46s/it]




 74%|███████▍  | 394/532 [36:30<10:39,  4.63s/it]




 74%|███████▍  | 395/532 [36:33<09:14,  4.05s/it]




 74%|███████▍  | 396/532 [36:34<07:07,  3.14s/it]




 75%|███████▍  | 397/532 [36:39<08:51,  3.94s/it]




 75%|███████▍  | 398/532 [36:44<09:04,  4.07s/it]




 75%|███████▌  | 399/532 [36:48<09:06,  4.11s/it]




 75%|███████▌  | 400/532 [36:54<10:10,  4.63s/it]




 75%|███████▌  | 401/532 [36:57<09:24,  4.31s/it]




 76%|███████▌  | 402/532 [37:06<12:24,  5.72s/it]




 76%|███████▌  | 403/532 [37:11<11:34,  5.38s/it]




 76%|███████▌  | 404/532 [37:14<10:03,  4.72s/it]




 76%|███████▌  | 405/532 [37:19<09:58,  4.71s/it]




 76%|███████▋  | 406/532 [37:27<11:55,  5.68s/it]




 77%|███████▋  | 407/532 [37:35<13:18,  6.39s/it]




 77%|███████▋  | 408/532 [37:40<12:09,  5.89s/it]




 77%|███████▋  | 409/532 [37:46<12:25,  6.06s/it]




 77%|███████▋  | 410/532 [37:48<09:52,  4.85s/it]




 77%|███████▋  | 411/532 [37:50<08:17,  4.11s/it]




 77%|███████▋  | 412/532 [37:57<10:00,  5.01s/it]




 78%|███████▊  | 413/532 [38:06<11:54,  6.01s/it]




 78%|███████▊  | 414/532 [38:09<10:03,  5.12s/it]




 78%|███████▊  | 415/532 [38:15<10:34,  5.43s/it]




 78%|███████▊  | 416/532 [38:19<09:50,  5.09s/it]




 78%|███████▊  | 417/532 [38:23<09:08,  4.77s/it]




 79%|███████▊  | 418/532 [38:28<09:02,  4.76s/it]




 79%|███████▉  | 419/532 [38:31<07:49,  4.15s/it]




 79%|███████▉  | 420/532 [38:35<07:48,  4.19s/it]




 79%|███████▉  | 421/532 [38:39<07:45,  4.19s/it]




 79%|███████▉  | 422/532 [38:43<07:41,  4.19s/it]




 80%|███████▉  | 423/532 [38:51<09:28,  5.21s/it]




 80%|███████▉  | 424/532 [38:53<07:44,  4.30s/it]




 80%|███████▉  | 425/532 [39:01<09:25,  5.29s/it]




 80%|████████  | 426/532 [39:08<10:12,  5.78s/it]




 80%|████████  | 427/532 [39:13<09:38,  5.51s/it]




 80%|████████  | 428/532 [39:17<09:01,  5.20s/it]




 81%|████████  | 429/532 [39:21<08:06,  4.73s/it]




 81%|████████  | 430/532 [39:25<08:01,  4.72s/it]




 81%|████████  | 431/532 [39:29<07:25,  4.42s/it]




 81%|████████  | 432/532 [39:33<07:10,  4.31s/it]




 81%|████████▏ | 433/532 [39:41<08:47,  5.33s/it]




 82%|████████▏ | 434/532 [39:44<07:30,  4.59s/it]




 82%|████████▏ | 435/532 [39:46<06:22,  3.94s/it]




 82%|████████▏ | 436/532 [39:50<06:26,  4.03s/it]




 82%|████████▏ | 437/532 [39:58<08:14,  5.21s/it]




 82%|████████▏ | 438/532 [40:02<07:37,  4.86s/it]




 83%|████████▎ | 439/532 [40:09<08:13,  5.31s/it]




 83%|████████▎ | 440/532 [40:14<08:11,  5.34s/it]




 83%|████████▎ | 441/532 [40:22<09:04,  5.99s/it]




 83%|████████▎ | 442/532 [40:24<07:12,  4.81s/it]




 83%|████████▎ | 443/532 [40:30<07:56,  5.35s/it]




 83%|████████▎ | 444/532 [40:34<06:56,  4.74s/it]




 84%|████████▎ | 445/532 [40:41<07:47,  5.38s/it]




 84%|████████▍ | 446/532 [40:47<08:12,  5.73s/it]




 84%|████████▍ | 447/532 [40:51<07:07,  5.03s/it]




 84%|████████▍ | 448/532 [40:55<06:59,  5.00s/it]




 84%|████████▍ | 449/532 [41:01<06:58,  5.05s/it]




 85%|████████▍ | 450/532 [41:05<06:44,  4.93s/it]




 85%|████████▍ | 451/532 [41:15<08:28,  6.27s/it]




 85%|████████▍ | 452/532 [41:21<08:29,  6.36s/it]




 85%|████████▌ | 453/532 [41:25<07:17,  5.54s/it]




 85%|████████▌ | 454/532 [41:29<06:49,  5.25s/it]




 86%|████████▌ | 455/532 [41:34<06:20,  4.94s/it]




 86%|████████▌ | 456/532 [41:38<06:06,  4.82s/it]




 86%|████████▌ | 457/532 [41:40<04:59,  4.00s/it]




 86%|████████▌ | 458/532 [41:47<05:52,  4.76s/it]




 86%|████████▋ | 459/532 [41:51<05:36,  4.61s/it]




 86%|████████▋ | 460/532 [41:59<06:45,  5.64s/it]




 87%|████████▋ | 461/532 [42:05<06:44,  5.70s/it]




 87%|████████▋ | 462/532 [42:10<06:15,  5.37s/it]




 87%|████████▋ | 463/532 [42:18<07:12,  6.26s/it]




 87%|████████▋ | 464/532 [42:26<07:46,  6.87s/it]




 87%|████████▋ | 465/532 [42:31<06:51,  6.14s/it]




 88%|████████▊ | 466/532 [42:37<06:44,  6.13s/it]




 88%|████████▊ | 467/532 [42:43<06:39,  6.15s/it]




 88%|████████▊ | 468/532 [42:47<05:49,  5.47s/it]




 88%|████████▊ | 469/532 [42:58<07:39,  7.30s/it]




 88%|████████▊ | 470/532 [43:06<07:32,  7.30s/it]





 89%|████████▊ | 472/532 [43:16<06:13,  6.23s/it]





 89%|████████▉ | 473/532 [43:29<08:00,  8.14s/it]




 89%|████████▉ | 475/532 [43:41<06:46,  7.13s/it]




 89%|████████▉ | 476/532 [43:45<05:41,  6.10s/it]




 90%|████████▉ | 477/532 [43:52<05:58,  6.52s/it]





 90%|█████████ | 479/532 [44:09<06:47,  7.70s/it]




 90%|█████████ | 480/532 [44:13<05:37,  6.50s/it]





 90%|█████████ | 481/532 [44:15<04:23,  5.17s/it]




 91%|█████████ | 483/532 [44:40<08:10, 10.02s/it]




 91%|█████████ | 484/532 [44:44<06:25,  8.03s/it]





 91%|█████████▏| 486/532 [44:59<06:07,  8.00s/it]





 92%|█████████▏| 487/532 [45:02<04:55,  6.57s/it]




 92%|█████████▏| 489/532 [45:19<05:21,  7.48s/it]





 92%|█████████▏| 491/532 [45:30<04:29,  6.58s/it]





 92%|█████████▏| 492/532 [45:38<04:41,  7.03s/it]




 93%|█████████▎| 493/532 [45:43<04:12,  6.47s/it]




 93%|█████████▎| 494/532 [45:45<03:20,  5.28s/it]




 93%|█████████▎| 495/532 [45:48<02:51,  4.63s/it]




 93%|█████████▎| 496/532 [45:49<02:01,  3.39s/it]




 93%|█████████▎| 497/532 [45:54<02:12,  3.79s/it]




 94%|█████████▍| 499/532 [46:02<02:14,  4.08s/it]




 94%|█████████▍| 500/532 [46:06<02:16,  4.26s/it]





 94%|█████████▍| 502/532 [46:21<02:43,  5.46s/it]




 95%|█████████▍| 503/532 [46:23<02:13,  4.59s/it]





 95%|█████████▍| 504/532 [46:30<02:26,  5.22s/it]




 95%|█████████▌| 506/532 [46:40<02:14,  5.16s/it]





 95%|█████████▌| 507/532 [46:44<02:00,  4.83s/it]




 95%|█████████▌| 508/532 [46:48<01:51,  4.65s/it]




 96%|█████████▌| 509/532 [46:59<02:31,  6.57s/it]




 96%|█████████▌| 511/532 [47:12<02:20,  6.67s/it]





 96%|█████████▌| 512/532 [47:17<01:59,  5.96s/it]




 96%|█████████▋| 513/532 [47:20<01:38,  5.20s/it]




 97%|█████████▋| 514/532 [47:30<01:58,  6.60s/it]




 97%|█████████▋| 515/532 [47:33<01:33,  5.52s/it]




 97%|█████████▋| 517/532 [47:38<01:00,  4.04s/it]





 97%|█████████▋| 518/532 [47:44<01:03,  4.55s/it]




 98%|█████████▊| 519/532 [47:48<00:55,  4.24s/it]




 98%|█████████▊| 521/532 [47:56<00:46,  4.20s/it]





 98%|█████████▊| 522/532 [48:00<00:40,  4.09s/it]




 98%|█████████▊| 523/532 [48:09<00:50,  5.61s/it]




 98%|█████████▊| 524/532 [48:13<00:40,  5.00s/it]




 99%|█████████▊| 525/532 [48:21<00:41,  5.95s/it]




 99%|█████████▉| 526/532 [48:24<00:30,  5.10s/it]




 99%|█████████▉| 527/532 [48:30<00:26,  5.25s/it]




 99%|█████████▉| 528/532 [48:34<00:19,  4.93s/it]




 99%|█████████▉| 529/532 [48:36<00:12,  4.03s/it]




100%|█████████▉| 530/532 [48:39<00:07,  3.76s/it]




100%|█████████▉| 531/532 [48:45<00:04,  4.50s/it]




100%|██████████| 532/532 [48:49<00:00,  5.51s/it]


Unnamed: 0_level_0,ticker,headline
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-07-08 07:20:18,CSCO,Dow Jones Selected Stocks 1710 - July 08
2019-07-08 19:50:01,CNP,NYSE ORDER IMBALANCE <CNP.N> 156400.0 SHARES O...
2019-07-08 23:16:50,CSCO,Dow Jones Selected Stocks - July 09
2019-07-09 06:01:05,AAPL,RCS - IMImobile PLC - Vauxhall launch Apple Bu...
2019-07-09 06:12:22,BLK,REG - iShares PLC - Net Asset Value(s)
2020-10-15 15:49:06,AAL,"BUZZ-U.S. STOCKS ON THE MOVE-Charles Schwab, W..."
2020-10-15 16:00:00,AMZN,Merkle Enters into Strategic Collaboration Agr...
2020-10-15 16:02:57,AEP,"CLASS ACTION UPDATE for AEP, FENC and BMRN: Le..."
2020-10-15 16:04:09,ADBE,Adobe (ADBE) Up 6.4% Since Last Earnings Repor...
2020-10-15 16:06:53,AAL,American Airlines Group Inc. - American Airlin...


In [7]:
# Summarise the news frame: per-column dtypes, non-null counts and memory footprint.
# NOTE(review): newer pandas releases rename `null_counts` to `show_counts` -
# confirm against the installed version before upgrading.
news.info(verbose=True, memory_usage=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 568560 entries, 2019-07-08 07:20:18 to 2020-10-15 16:06:53
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ticker    568560 non-null  object
 1   headline  568560 non-null  object
dtypes: object(2)
memory usage: 13.0+ MB


In [8]:
# Resample news by frequency
def _bucket_headlines(frame, freq):
    """Concatenate each ticker's headlines inside every `freq` time bucket.

    Parameters
    ----------
    frame : pandas.DataFrame
        News table with a 'ticker' column, a 'headline' column of strings and
        a datetime index (required by ``resample``).
    freq : str
        Resample rule handed to ``resample``, e.g. 'D', 'H' or '15min'.

    Returns
    -------
    pandas.DataFrame
        A single 'headline' column holding the space-joined headlines of each
        (ticker, time bucket) pair; buckets without any headline are removed.
    """
    # groupby/resample/agg build a new frame and never modify `frame`, so no
    # defensive copy of the (large) input is needed.
    bucketed = frame.groupby('ticker').resample(freq).agg({'headline': ' '.join})
    # A bucket with no news joins to '', which is mapped to NaN so that
    # dropna() discards it.
    bucketed['headline'] = bucketed['headline'].replace('', np.nan)
    return bucketed.dropna()


daily_news = _bucket_headlines(news, 'D')
hourly_news = _bucket_headlines(news, 'H')
min_15_news = _bucket_headlines(news, '15min')

### Daily Data Processing

In [9]:
# Load the daily price bars and print a summary of the resulting frame.
daily = get_prices('daily')
daily.info(verbose=True, memory_usage=True, null_counts=True)

['model_data', 'prices']
['15_min', 'daily', 'hourly']
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 231075 entries, ('A', Timestamp('2019-01-02 00:00:00')) to ('ZTS', Timestamp('2020-10-14 00:00:00'))
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Open    231075 non-null  float64
 1   Close   231075 non-null  float64
dtypes: float64(2)
memory usage: 4.4+ MB


In [10]:
# Next-bar close-to-close return for every ticker. Both the percentage change
# and the one-step look-ahead shift are done inside the ticker group, so a
# value can never be pulled across a ticker boundary regardless of row order.
daily['returns'] = (
    daily['Close']
    .groupby(level='ticker').pct_change()
    .groupby(level='ticker').shift(-1)
)
# Direction label: -1 for a strictly negative return, +1 otherwise. Zero and
# missing returns therefore also receive +1; rows with a missing return are
# removed by the later daily.dropna().
daily['label'] = np.where(daily['returns'] < 0, -1.0, 1.0)

# Keep only the index entries present in both the price and the news frame.
intersect = daily.index.intersection(daily_news.index)

daily_news = daily_news.loc[intersect, :].sort_index()
daily = daily.loc[intersect, :].sort_index()

# Attach prices, returns and labels to the aggregated headlines.
daily = daily_news.join(daily[['Open', 'Close', 'returns', 'label']])


daily.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 111261 entries, ('A', Timestamp('2019-07-11 00:00:00')) to ('ZTS', Timestamp('2020-10-14 00:00:00'))
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   headline  111261 non-null  object 
 1   Open      111261 non-null  float64
 2   Close     111261 non-null  float64
 3   returns   110905 non-null  float64
 4   label     111261 non-null  float64
dtypes: float64(4), object(1)
memory usage: 9.7+ MB


In [11]:
# Discard rows containing NaN (only `returns` has missing values at this point).
daily = daily.dropna()
# Wrapping each headline in '<s>' ... '<\s' start/end tokens is currently disabled.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('model_data/daily', daily)

In [12]:
# Hourly Data Processing

In [13]:
# Load the hourly price bars and print a summary of the resulting frame.
hourly = get_prices('hourly')
hourly.info(verbose=True, memory_usage=True, null_counts=True)

['model_data', 'prices']
['15_min', 'daily', 'hourly']
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1888760 entries, ('A', Timestamp('2019-01-02 14:00:00')) to ('ZTS', Timestamp('2020-10-14 20:00:00'))
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   Open    1888760 non-null  float64
 1   Close   1888760 non-null  float64
dtypes: float64(2)
memory usage: 36.3+ MB


In [14]:
def o_c_pct_change(df):
    """Relative change from each row's Open to the following row's Close.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Open' and 'Close' columns, one row per bar.

    Returns
    -------
    pandas.Series
        ``(next Close - Open) / Open`` for every row; the last row is NaN
        because it has no following row.
    """
    entry_open = df['Open']
    # shift(-1) aligns the next row's Close with the current row.
    next_close = df['Close'].shift(-1)
    return (next_close - entry_open) / entry_open

# Forward open-to-close return, evaluated separately for every ticker.
hourly['returns'] = (
    hourly.groupby('ticker', group_keys=False).apply(o_c_pct_change)
)
# Direction label: -1 for a strictly negative return, +1 otherwise
# (zero and missing returns included).
hourly['label'] = np.where(hourly['returns'] < 0, -1.0, 1.0)

# Index entries that exist in both the price bars and the news buckets.
hourly_intersect = hourly.index.intersection(hourly_news.index)

hourly = hourly.loc[hourly_intersect, :].sort_index()
hourly_news = hourly_news.loc[hourly_intersect, :].sort_index()

# Attach prices, returns and labels to the aggregated headlines.
hourly = hourly_news.join(hourly[['Open', 'Close', 'returns', 'label']])

# Treat empty headline strings as missing values.
hourly['headline'] = hourly['headline'].replace('', np.nan)

hourly.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 180259 entries, ('A', Timestamp('2019-07-11 13:00:00')) to ('ZTS', Timestamp('2020-10-14 16:00:00'))
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   headline  180259 non-null  object 
 1   Open      180259 non-null  float64
 2   Close     180259 non-null  float64
 3   returns   180215 non-null  float64
 4   label     180259 non-null  float64
dtypes: float64(4), object(1)
memory usage: 12.6+ MB


In [15]:
# Discard rows containing NaN, then keep just the model input and its target.
hourly = hourly.dropna()[['headline', 'label']]
# Wrapping each headline in '<s>' ... '<\s' start/end tokens is currently disabled.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('model_data/hourly', hourly)

In [16]:
# 15-Minute Data Processing

In [17]:
# Load the 15-minute price bars and print a summary of the resulting frame.
min_15 = get_prices('15_min')
min_15.info(verbose=True, memory_usage=True, null_counts=True)

['model_data', 'prices']
['15_min', 'daily', 'hourly']
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6362236 entries, ('A', Timestamp('2019-01-02 14:30:00')) to ('ZTS', Timestamp('2020-10-14 20:00:00'))
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   Open    6362236 non-null  float64
 1   Close   6362236 non-null  float64
dtypes: float64(2)
memory usage: 134.3+ MB


In [18]:
# Forward open-to-close return, evaluated separately for every ticker.
min_15['returns'] = (
    min_15.groupby('ticker', group_keys=False).apply(o_c_pct_change)
)
# Direction label: -1 for a strictly negative return, +1 otherwise
# (zero and missing returns included).
min_15['label'] = np.where(min_15['returns'] < 0, -1.0, 1.0)

# Index entries that exist in both the price bars and the news buckets.
min15_intersect = min_15.index.intersection(min_15_news.index)

min_15 = min_15.loc[min15_intersect, :].sort_index()
min_15_news = min_15_news.loc[min15_intersect, :].sort_index()

# Attach prices, returns and labels to the aggregated headlines.
min_15 = min_15_news.join(min_15[['Open', 'Close','returns', 'label']])

# Treat empty headline strings as missing values.
min_15['headline'] = min_15['headline'].replace('', np.nan)
min_15.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 184156 entries, ('A', Timestamp('2019-07-11 16:30:00')) to ('ZTS', Timestamp('2020-10-14 16:45:00'))
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   headline  184156 non-null  object 
 1   Open      184156 non-null  float64
 2   Close     184156 non-null  float64
 3   returns   184146 non-null  float64
 4   label     184156 non-null  float64
dtypes: float64(4), object(1)
memory usage: 12.8+ MB


In [19]:
# Discard rows containing NaN, then keep just the model input and its target.
min_15 = min_15.dropna()[['headline', 'label']]
# Wrapping each headline in '<s>' ... '<\s' start/end tokens is currently disabled.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('model_data/15_min', min_15)


In [20]:
# Print every dataset key currently held in the HDF5 store.
with pd.HDFStore(DATA_STORE) as hdf:
    print(hdf.keys())

['/model_data/15_min', '/model_data/daily', '/model_data/hourly']
