HRP, Part 2 (Preparing the Data)

What this post covers

Prepare the data to be used.

Reference code

github.com

The relevant part of the reference code:

stocks = {
    "Apple": "AAPL",
    "Amazon": "AMZN",
    "Alphabet": "GOOG",
    "Microsoft": "MSFT",
    "Facebook": "FB",
    "Alibaba": "BABA",
    "Berkshire Hathaway": "BRK-A",
    "Tencent": "TCEHY",
    "JPMorgan": "JPM",
    "ExxonMobil": "XOM",
    "Johnson & Johnson": "JNJ",
    "Samsung Electronics": "005930.KS",
    "Bank of America": "BAC"
}
stocks = pd.DataFrame(list(stocks.items()), columns=['name', 'symbol'])
ts = TimeSeries(key=config.key, output_format='pandas')
stocks_close = pd.DataFrame()
for symbol in stocks.symbol.values:
    data, _ = ts.get_daily(symbol=symbol, outputsize='full')
    close = data['4. close']
    close.index = pd.to_datetime(close.index)
    stocks_close = stocks_close.append(close)
stocks_close = stocks_close.T
stocks_close = stocks_close.sort_index()
stocks_close = stocks_close.fillna(method='ffill')
stocks_close.columns = stocks.name.values
stocks_close = stocks_close["2015-01-01":"2018-01-01"]
returns = stocks_close.to_returns().dropna()

The plan

Breaking down the steps

First, let's break down what the reference code above is doing. As far as I can tell, it works as follows.

  1. For each asset in stocks:

    1. Fetch its daily price data from alpha_vantage

    2. Extract the closing price ('4. close') from the fetched data

    3. Convert the index to datetime

    4. Append it to stocks_close

  2. For the fully combined stocks_close:

    1. Transpose, sort by index, forward-fill missing values, and so on

    2. Set the column names (and slice out the 2015-2018 period)

    3. Convert the prices at each date into returns (see the sketch below)
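
A note on the last step: to_returns() is not plain pandas; it comes from the ffn library, which (as far as I understand) adds it to pandas objects and computes simple returns, r_t = P_t / P_{t-1} - 1, i.e. the same thing as pandas' own pct_change(). A minimal sketch with made-up prices to convince myself:

import pandas as pd
import ffn  # adds .to_returns() to pandas Series/DataFrames

# made-up closing prices, just for illustration
prices = pd.Series([100.0, 102.0, 99.0, 101.0],
                   index=pd.date_range('2015-01-01', periods=4))

print(prices.to_returns().dropna())   # simple returns: P_t / P_{t-1} - 1
print(prices.pct_change().dropna())   # should match the line above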

Checking the plan

I'm not entirely sure about this, and since the code is right there, let's run it and check that the breakdown above is correct.

The API key used is one I obtained myself.

import scipy.cluster.hierarchy as sch
import numpy as np
import pandas as pd
import datetime as date
import matplotlib.pyplot as plt
import cvxopt as opt
import ffn  # ffn provides the to_returns() method used at the end
from alpha_vantage.timeseries import TimeSeries

stocks = {
    "Apple": "AAPL",
    "Amazon": "AMZN",
    "Alphabet": "GOOG",
    "Microsoft": "MSFT",
    "Facebook": "FB",
    "Alibaba": "BABA",
    "Berkshire Hathaway": "BRK-A",
    "Tencent": "TCEHY",
    "JPMorgan": "JPM",
    "ExxonMobil": "XOM",
    "Johnson & Johnson": "JNJ",
    "Samsung Electronics": "005930.KS",
    "Bank of America": "BAC"
}
stocks = pd.DataFrame(list(stocks.items()), columns=['name', 'symbol'])
ts = TimeSeries(key='my own API key', output_format='pandas')
stocks_close = pd.DataFrame()
for symbol in stocks.symbol.values:
    data, _ = ts.get_daily(symbol=symbol, outputsize='full')
    close = data['4. close']
    close.index = pd.to_datetime(close.index)
    stocks_close = stocks_close.append(close)
stocks_close = stocks_close.T
stocks_close = stocks_close.sort_index()
stocks_close = stocks_close.fillna(method='ffill')
stocks_close.columns = stocks.name.values
stocks_close = stocks_close["2015-01-01":"2018-01-01"]
returns = stocks_close.to_returns().dropna()
print (returns.head(10))

Result

ValueError: Thank you for using Alpha Vantage! Our standard API call frequency is 5 calls per minute and 500 calls per day. Please visit https://www.alphavantage.co/premium/ if you would like to target a higher API call frequency.

Damn... so the free tier won't cut it...
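
In principle the full 13 assets could still be fetched on the free tier by waiting between requests to stay under the 5-calls-per-minute limit. I didn't bother here, but a rough, untested sketch (reusing the ts and stocks defined above) would be:

import time

stocks_close = pd.DataFrame()
for symbol in stocks.symbol.values:
    data, _ = ts.get_daily(symbol=symbol, outputsize='full')
    close = data['4. close']
    close.index = pd.to_datetime(close.index)
    stocks_close = stocks_close.append(close)
    time.sleep(13)  # roughly 4-5 calls per minute, under the free-tier rate limit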

So instead I cut the number of assets down to five and tried again; the result is below.

               Apple    Amazon  Alphabet  Microsoft  Facebook
2015-01-05 -0.028172 -0.020517 -0.020846  -0.009303 -0.016061
2015-01-06  0.000094 -0.022833 -0.023177  -0.014571 -0.013473
2015-01-07  0.014022  0.010600 -0.001713   0.012705  0.000000
2015-01-08  0.038422  0.006836  0.003153   0.029418  0.026592
2015-01-09  0.001072 -0.011749 -0.012951  -0.008405 -0.005564
2015-01-12 -0.024641 -0.018590 -0.007296  -0.012503 -0.013121
2015-01-13  0.008879  0.011427  0.007370  -0.005258 -0.003519
2015-01-14 -0.003811 -0.004987  0.009452  -0.008629 -0.002224
2015-01-15 -0.027140 -0.021550  0.001837  -0.010336 -0.029234
2015-01-16 -0.007770  0.013208  0.012535   0.016711  0.015260

So the breakdown was right!
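
One caveat for anyone rerunning this later: stocks_close.append(close) uses the old DataFrame.append, which was deprecated and eventually removed in pandas 2.0. On a recent pandas, the same accumulation could be written with pd.concat instead, roughly like this (untested sketch):

closes = []
for symbol in stocks.symbol.values:
    data, _ = ts.get_daily(symbol=symbol, outputsize='full')
    close = data['4. close'].rename(symbol)  # keep the symbol as the column name
    close.index = pd.to_datetime(close.index)
    closes.append(close)

# each series becomes a column directly, so the .T transpose is no longer needed
stocks_close = pd.concat(closes, axis=1).sort_index()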

The data I actually prepared

Data used

The following six assets, covering the period 2005/1/4 to 2020/3/9 (the CSV layout the code expects is sketched after the list).

  • USD/JPY

  • AUD/JPY

  • NZD/USD

  • USD/ZAR

  • Nifty50

  • N225
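
Each asset's price history is a CSV directly under data/, one file per asset, with the asset name as the file name (data/USDJPY.csv and so on). Judging from the read function below, the files are assumed to have 日付け (date) and 終値 (close) columns, dates written like 2005年01月04日, and possibly comma-separated thousands in the values. A hypothetical example of the assumed layout, with made-up values:

import os

# write a tiny sample file in the assumed format, purely for illustration
os.makedirs('data', exist_ok=True)
with open('data/N225.csv', 'w', encoding='utf-8') as f:
    f.write('日付け,終値\n')
    f.write('2005年01月04日,"11,517.75"\n')
    f.write('2005年01月05日,"11,408.40"\n')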

Code

The functions I wrote (well, had written for me) last time can be used as-is.

def read_and_pre_process_asset_df(asset_file_path : str) -> pd.DataFrame :
    asset_name_ext = os.path.basename(asset_file_path)
    asset_name = os.path.splitext(asset_name_ext)[0]

    asset_df = pd.read_csv(asset_file_path, encoding = "utf-8", usecols = ['日付け','終値'])

    asset_df = asset_df.rename(columns = {'日付け' : 'Date', '終値' : asset_name})
    asset_df['Date'] = pd.to_datetime(asset_df['Date'], format = '%Y年%m月%d日')
    asset_df['Date'] = asset_df['Date'].dt.date

    return asset_df

def merge_df(df_list : list, on : str = 'Date') -> pd.DataFrame :
    total_df = df_list[0]
    for df in df_list[1:]:
        total_df = pd.merge(df, total_df, on = on)

    return total_df

def pre_process_total_df(total_df : pd.DataFrame) -> pd.DataFrame :
    total_df = total_df.sort_values(['Date'])
    total_df = total_df.set_index('Date')
    total_df = total_df.astype(str)\
                .applymap(lambda x: x.replace(',', ''))\
                .astype(float)

    return total_df

asset_file_paths = glob.glob('data/*.csv')  # read the CSV files directly under the data directory
asset_df_list = [read_and_pre_process_asset_df(asset_file_path)
                 for asset_file_path in asset_file_paths]
asset_total_df = merge_df(asset_df_list)
asset_total_df = pre_process_total_df(asset_total_df)
df_return = asset_total_df.pct_change().dropna()
print(df_return.head(10))

Result

              USDZAR    USDJPY    NZDUSD   Nifty50      N225    AUDJPY
Date                                                                  
2005-01-05  0.017574 -0.004971 -0.005266 -0.034011 -0.006966 -0.007857
2005-01-06  0.032452  0.009415 -0.002003 -0.016657  0.004786  0.004148
2005-01-07 -0.003688 -0.002570 -0.004158  0.008582 -0.005136 -0.006510
2005-01-11 -0.022719 -0.013645  0.004463 -0.031481  0.009337 -0.006804
2005-01-12 -0.001894 -0.009190  0.014046 -0.019697 -0.007504 -0.002537
2005-01-13  0.014610 -0.000098 -0.004523  0.021399 -0.008309 -0.002417
2005-01-14 -0.000331 -0.003711 -0.006815 -0.011998  0.007058 -0.012878
2005-01-17  0.000017  0.000392 -0.004861  0.000932  0.004258 -0.000646
2005-01-18  0.008675  0.002155 -0.002299  0.000595 -0.005558 -0.001422
2005-01-19 -0.005728  0.005181  0.009791 -0.003826 -0.001569  0.010743
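
One thing to keep in mind: merge_df relies on pd.merge's default how='inner', so only dates on which every one of the six assets has a price make it into asset_total_df; any date on which one of the markets was closed simply drops out, which is presumably why some weekdays are missing from the output above. A tiny example of that behaviour:

import pandas as pd

a = pd.DataFrame({'Date': ['2005-01-05', '2005-01-06', '2005-01-07'], 'A': [1.0, 2.0, 3.0]})
b = pd.DataFrame({'Date': ['2005-01-05', '2005-01-07'], 'B': [10.0, 30.0]})  # no 01-06 row

print(pd.merge(a, b, on='Date'))  # inner join keeps only 01-05 and 01-07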

Done~

Current code

import os  # os.path is used in read_and_pre_process_asset_df
import scipy.cluster.hierarchy as sch
import numpy as np
import pandas as pd
import datetime as date
import matplotlib.pyplot as plt
import cvxopt as opt
from alpha_vantage.timeseries import TimeSeries
import ffn
import glob

def read_and_pre_process_asset_df(asset_file_path : str) -> pd.DataFrame :
    asset_name_ext = os.path.basename(asset_file_path)
    asset_name = os.path.splitext(asset_name_ext)[0]

    asset_df = pd.read_csv(asset_file_path, encoding = "utf-8", usecols = ['日付け','終値'])

    asset_df = asset_df.rename(columns = {'日付け' : 'Date', '終値' : asset_name})
    asset_df['Date'] = pd.to_datetime(asset_df['Date'], format = '%Y年%m月%d日')
    asset_df['Date'] = asset_df['Date'].dt.date

    return asset_df

def merge_df(df_list : list, on : str = 'Date') -> pd.DataFrame :
    total_df = df_list[0]
    for df in df_list[1:]:
        total_df = pd.merge(df, total_df, on = on)

    return total_df

def pre_process_total_df(total_df : pd.DataFrame) -> pd.DataFrame :
    total_df = total_df.sort_values(['Date'])
    total_df = total_df.set_index('Date')
    total_df = total_df.astype(str)\
                .applymap(lambda x: x.replace(',', ''))\
                .astype(float)

    return total_df

asset_file_paths = glob.glob('data/*.csv')  # read the CSV files directly under the data directory
asset_df_list = [read_and_pre_process_asset_df(asset_file_path)
                 for asset_file_path in asset_file_paths]
asset_total_df = merge_df(asset_df_list)
asset_total_df = pre_process_total_df(asset_total_df)

# compute the rate of return at each date
df_return = asset_total_df.pct_change().dropna()