Project Setup¶
import pickle
from pathlib import Path
from collections import deque
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import xgboost as xgb
import ccxt
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
After importing, let's create a watermark.
%load_ext watermark
%watermark -v -m -p ccxt,matplotlib,pandas,numpy,statsmodels,sklearn,xgboost,watermark
Define ccxt fetch_ohlcv function¶
Get OHLCV data from the public API endpoint. Convert it to a pandas DataFrame with named columns and a human-readable timestamp.
def get_ohlcv(exchange, symbol, timeframe):
    """Fetch OHLCV candles from an exchange as a pandas DataFrame.

    Args:
        exchange: Object exposing ``fetch_ohlcv(symbol, timeframe)`` that
            returns rows of six values each.
        symbol: Market symbol passed through to ``fetch_ohlcv``.
        timeframe: Candle size passed through to ``fetch_ohlcv``
            (1m 5m 15m 30m 1h 2h... etc.).

    Returns:
        DataFrame indexed by ``date`` with the columns ``open``, ``high``,
        ``low``, ``close`` and ``volume``. An empty response yields an
        empty frame with the same columns.
    """
    column_names = ["date", "open", "high", "low", "close", "volume"]
    raw_data = exchange.fetch_ohlcv(symbol, timeframe)
    # Naming the columns in the constructor (rather than assigning
    # ``.columns`` afterwards) also works when the response has no rows.
    ohlcv_data = pd.DataFrame(raw_data, columns=column_names)
    # The first value of each row is treated as a millisecond epoch timestamp.
    ohlcv_data['date'] = pd.to_datetime(ohlcv_data['date'], unit='ms')
    return ohlcv_data.set_index('date')
Define CCXT Helper Function¶
To assist with this data retrieval we'll define a function to download and cache datasets from an exchange. It will use a sub-folder whose name we can change below.
def get_ccxt_data(exchange, symbol, timeframe):
    """Return OHLCV data for a market, using a local pickle cache.

    The cache file lives in the ``raw_data`` sub-folder of the current
    working directory and is named after the exchange name, symbol and
    timeframe (with ``/`` replaced by ``-``).

    Args:
        exchange: Exchange object; its ``name`` attribute is used in the
            cache file name and it is passed on to ``get_ohlcv``.
        symbol: Market symbol, e.g. ``'BTC/USDT'``.
        timeframe: Candle size, e.g. ``'1d'``.

    Returns:
        The cached object if the cache file could be read, otherwise the
        freshly downloaded DataFrame (which is then written to the cache).
    """
    local_folder = Path.cwd()
    data_folder = "raw_data"
    file_name = '{}_{}_{}.pkl'.format(exchange.name, symbol, timeframe).replace('/', '-')
    cache_path = Path(local_folder, data_folder, file_name)
    try:
        # NOTE: pickle executes arbitrary code on load; only keep files you
        # wrote yourself in the cache folder.
        with open(cache_path, 'rb') as cache_file:
            df = pickle.load(cache_file)
    except OSError:
        print('Downloading {} {} from {}'.format(symbol, timeframe, exchange.name,))
        df = get_ohlcv(exchange, symbol, timeframe)
        # Create the cache folder on first use so that to_pickle cannot fail
        # on a missing directory.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_pickle(cache_path)
        print('Cached {}_{}_{} at {}'.format(exchange.name, symbol, timeframe, cache_path))
    else:
        print('Loaded {}_{}_{} from cache'.format(exchange.name, symbol, timeframe))
    return df
Pull Binance Exchange Pricing Data¶
For this notebook we will take 1d candles but you can try other precisions offered by your data provider.
# Exchange client, market and candle size used for the rest of the notebook.
exchange = ccxt.binance()
symbol, timeframe = 'BTC/USDT', '1d'
Since all the wrapping and HTTP calls are handled by the functions above, we just say:
# Download the candles, or load them from the local cache when available.
btc_usdt_price_binance = get_ccxt_data(
    exchange=exchange,
    symbol=symbol,
    timeframe=timeframe,
)
Let's explore what we got.
# Look at the first closing prices, then report the number of rows fetched.
btc_usdt_price_binance['close'].head(3)
row_count = btc_usdt_price_binance.shape[0]
print('There are {} number of days in the dataset.'.format(row_count))
Next, we'll generate a simple chart as a quick visual verification that the data looks correct.
# Quick visual sanity check of the closing prices.
closing_prices = btc_usdt_price_binance['close']
closing_prices.plot(figsize=(12, 6))
Collect features¶
# Feature table: starts with the closing price; later cells add columns to it.
dataset_total_df = btc_usdt_price_binance['close'].to_frame()
FFT¶
We will use Fourier transforms to extract global and local trends, and also to denoise it a little. So let's see how it works.
# Closing prices plus a helper frame kept for backwards compatibility.
close_data = pd.DataFrame(btc_usdt_price_binance['close'])
close_data['datetime'] = pd.to_datetime(close_data.index, unit='d')
data_FT = pd.DataFrame(close_data[['datetime', 'close']].values)
data_FT.columns = ['Date', 'Close']

# Real-input FFT: the result holds only the non-negative frequencies,
# ordered from the constant (DC) term upwards.
close_values = np.asarray(close_data['close'], dtype=float)
sample_count = len(close_values)
close_fft = np.fft.rfft(close_values)

fft_df = pd.DataFrame({'fft': close_fft})
fft_df['absolute'] = np.abs(close_fft)
fft_df['angle'] = np.angle(close_fft)
fft_list = np.asarray(close_fft)

for num_ in [3, 6, 9]:
    # Low-pass filter: keep the first `num_` components and zero the rest.
    # The `[num_:-num_]` slice used with a two-sided fft would also keep the
    # highest-frequency bins of this one-sided spectrum.
    fft_list_m10 = np.copy(fft_list)
    fft_list_m10[num_:] = 0
    # Pass n explicitly so the output length matches the input even when the
    # number of samples is odd.
    dataset_total_df['fft{}'.format(num_)] = np.fft.irfft(fft_list_m10, n=sample_count)

dataset_total_df.info()
# DataFrame.plot opens its own figure, so no separate plt.figure() is needed.
dataset_total_df.plot(figsize=(12, 6))
As you see the more components from the Fourier transform we use the closer the approximation function is to the real stock price (the 9 components transform is closer to the original function - the red and the blue lines almost overlap). We use Fourier transforms for the purpose of extracting long- and short-term trends so we will use the transforms with 3, 6, and 9 components. You can infer that the transform with 3 components serves as the long term trend.
Another technique used to denoise data is called wavelets. Wavelets and Fourier transforms gave similar results, so we will only use Fourier transforms.
# The spectrum in fft_df comes from np.fft.rfft, which returns only the
# non-negative frequencies in ascending order. Rotating it by half its length
# (a trick for centring a two-sided fft spectrum) would scramble that order,
# so the magnitudes are plotted as they are.
items = np.asarray(fft_df['absolute'].tolist())
plt.figure(figsize=(10, 7), dpi=80)
plt.stem(items)
plt.title('Components of Fourier transforms')
plt.show()
ARIMA¶
ARIMA is a technique for predicting time series data. We will show how to use it, and although ARIMA will not serve as our final prediction, we will use it as a technique to denoise the stock a little and to (possibly) extract some new patterns or features.
from pandas.plotting import autocorrelation_plot

# NOTE(review): `fit(disp=0)` and indexing the result of `forecast()` with [0]
# follow the ARIMA class imported at the top of the file — confirm that the
# installed statsmodels release still provides that API.
series = pd.DataFrame(close_data['close'])
series.index = pd.to_datetime(series.index, unit='D')

# Fit once on the full series and report the one-step-ahead forecast.
model = ARIMA(series, order=(5, 1, 0), missing='drop')
model_fit = model.fit(disp=0)
print(model_fit.summary())
output = model_fit.forecast()
yhat = output[0]
# Take the single element explicitly: calling float() on a 1-element array
# is deprecated in recent NumPy releases.
print('Next predicted value {0:.2f}'.format(float(np.ravel(yhat)[0])))

autocorrelation_plot(series)

# Walk-forward validation: refit on all observations seen so far, predict one
# step ahead, then reveal the true observation and add it to the history.
X = series.values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]
history = list(train)
predictions = []
for obs in test:
    model = ARIMA(history, order=(5, 1, 0), missing='drop')
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    history.append(obs)

error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
Plot the predicted (from ARIMA) and real prices
# Overlay the walk-forward ARIMA predictions on the held-out prices.
fig, ax = plt.subplots(figsize=(12, 6), dpi=100)
ax.plot(test, label='Real')
ax.plot(predictions, color='red', label='Predicted')
ax.set_xlabel('Days')
ax.set_ylabel('USDT')
ax.set_title('ARIMA model on BTC/USDT')
ax.legend()
plt.show()
Technical indicators¶
We will create technical indicators for BTC/USDT on Binance.
def get_technical_indicators(dataset):
    """Return a copy of ``dataset`` extended with technical indicators.

    Args:
        dataset: DataFrame with a ``close`` column.

    Returns:
        A new DataFrame holding the original columns followed by ``ma7``,
        ``ma21``, ``26ema``, ``12ema``, ``MACD``, ``20sd``, ``upper_band``,
        ``lower_band`` and ``ema``. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    dataset = dataset.copy()
    close = dataset['close']

    # 7 and 21 period moving averages
    dataset['ma7'] = close.rolling(window=7).mean()
    dataset['ma21'] = close.rolling(window=21).mean()

    # MACD: difference between the 12 and 26 span exponential averages
    dataset['26ema'] = close.ewm(span=26).mean()
    dataset['12ema'] = close.ewm(span=12).mean()
    dataset['MACD'] = dataset['12ema'] - dataset['26ema']

    # Bollinger-style bands: two standard deviations around ma21.
    # NOTE(review): the deviation uses a 20 period window while the centre
    # line uses 21 periods — confirm this mismatch is intended.
    dataset['20sd'] = close.rolling(20).std()
    dataset['upper_band'] = dataset['ma21'] + (dataset['20sd'] * 2)
    dataset['lower_band'] = dataset['ma21'] - (dataset['20sd'] * 2)

    # Fast-reacting exponential moving average
    dataset['ema'] = close.ewm(com=0.5).mean()
    return dataset
# Extend the feature table itself so the columns already collected in
# dataset_total_df are kept next to the technical indicators; passing `series`
# here would replace the table with a frame built from the close column only.
dataset_total_df = get_technical_indicators(dataset_total_df)
Features overview¶
# Report the size of the feature table and plot every column.
sample_count, feature_count = dataset_total_df.shape
print('Total dataset has {} samples, and {} features.'.format(sample_count, feature_count))
dataset_total_df.plot(figsize=(12, 6))
Feature importance with XGBoost¶
Having our features we have to consider whether all of them are really indicative of the direction BTC will take. There are many ways to test feature importance, but the one we will apply uses XGBoost, because it gives one of the best results in both classification and regression problems.
def get_feature_importance_data(data_income, train_fraction=0.665):
    """Split a feature table into chronological train and test sets.

    Args:
        data_income: DataFrame containing the target column ``close`` and
            any number of feature columns.
        train_fraction: Share of the rows (taken from the top) used for
            training. Defaults to 0.665.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` where ``y`` is the
        ``close`` column and ``X`` holds every other column.
    """
    data = data_income.copy()
    y = data['close']
    # Select the features by name so the target is excluded wherever the
    # ``close`` column sits in the frame.
    X = data.drop(columns='close')

    train_samples = int(X.shape[0] * train_fraction)

    # Positional slicing keeps the row order: no shuffling.
    X_train, X_test = X.iloc[:train_samples], X.iloc[train_samples:]
    y_train, y_test = y.iloc[:train_samples], y.iloc[train_samples:]

    return (X_train, y_train), (X_test, y_test)
# Split the feature table into the train and test parts used by XGBoost.
train_part, test_part = get_feature_importance_data(dataset_total_df)
X_train_FI, y_train_FI = train_part
X_test_FI, y_test_FI = test_part
Build XGB regressor
# Gradient-boosted regressor used only to rank the features.
regressor = xgb.XGBRegressor(
    gamma=0.0,
    n_estimators=150,
    base_score=0.7,
    colsample_bytree=1,
    learning_rate=0.05,
)
Train the regressor and check the result
# Fit while tracking the error on both the training and the test split.
evaluation_sets = [(X_train_FI, y_train_FI), (X_test_FI, y_test_FI)]
xgbModel = regressor.fit(
    X_train_FI,
    y_train_FI,
    eval_set=evaluation_sets,
    verbose=False,
)
eval_result = regressor.evals_result()
# One entry per boosting round recorded for the first evaluation set.
training_rounds = range(len(eval_result['validation_0']['rmse']))
Let's plot the training and validation errors in order to observe the training and check for overfitting (there isn't overfitting).
# RMSE per boosting round for the two evaluation sets.
for result_key, curve_label in (
    ('validation_0', 'Training Error'),
    ('validation_1', 'Validation Error'),
):
    plt.scatter(x=training_rounds, y=eval_result[result_key]['rmse'], label=curve_label)
plt.xlabel('Iterations')
plt.ylabel('RMSE')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()

# Bar chart of the importance score assigned to each feature column.
fig = plt.figure(figsize=(8, 8))
plt.xticks(rotation='vertical')
importance_scores = xgbModel.feature_importances_.tolist()
bar_positions = list(range(len(xgbModel.feature_importances_)))
plt.bar(bar_positions, importance_scores, tick_label=X_test_FI.columns)
plt.title('Feature importance.')
plt.show()
Conclusion¶
We constructed approximations with some popular methods, such as the Fast Fourier Transform and so-called technical indicators (averaging-based approximations popular among traders), collected them in a pandas DataFrame, and then fitted and evaluated an XGBoost regressor on them. We can see that the popularity of these methods is correlated with our results. Don't forget that we did this for hypothetical "1d" rounded candles, while you can use this method to evaluate features of any kind of time-series data.
Comments
comments powered by Disqus