Historical news data missing for 2016

According to the News API docs:

Historical (REST)

News API provides historical news data dating back to 2015. You can expect to receive an average of 130+ news articles per day. All news data is currently provided directly by Benzinga.

I see a big chunk of data missing between Feb 2015 and Oct 2016:

row_num,created_at,updated_at
2547,2016-10-14 10:43:07+00:00,2016-10-14 10:43:07+00:00
2548,2016-10-14 10:38:53+00:00,2016-10-14 10:38:53+00:00
2549,2016-10-14 10:38:22+00:00,2016-10-14 10:38:23+00:00

… Missing Data

2550,2015-02-25 16:01:48+00:00,2015-02-25 16:01:48+00:00
2551,2015-02-25 14:26:21+00:00,2015-02-25 14:26:21+00:00
2552,2015-02-25 12:21:25+00:00,2015-02-25 12:21:25+00:00
2553,2015-02-24 19:28:26+00:00,2015-02-24 19:28:26+00:00
2554,2015-02-24 19:03:52+00:00,2015-02-24 19:03:52+00:00
2555,2015-02-24 16:25:35+00:00,2015-02-24 16:25:35+00:00
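
A quick way to confirm the hole is to sort the exported rows and look at the largest jump between consecutive timestamps. A minimal sketch, assuming the rows above were saved to news1.csv (the file the script below writes):

import pandas as pd

df = pd.read_csv('news1.csv')
df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
df = df.sort_values('created_at')

# the largest jump between consecutive articles; a multi-month hole shows up here
gaps = df['created_at'].diff()
print(gaps.max(), 'ending at', df.loc[gaps.idxmax(), 'created_at'])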

Python code for news fetching:

import pandas as pd
from datetime import datetime
from transformers import pipeline
from alpaca.common.rest import RESTClient
from decouple import config

api_key = config("ALPACA_KEY")
secret_key = config("ALPACA_SECRET")

news_client = RESTClient(base_url='https://data.alpaca.markets',
                         api_version='v1beta1',
                         api_key=api_key,
                         secret_key=secret_key)

classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

news_endpoint = '/news'
# zero-padded current date for the ISO end timestamp
now = datetime.now()
start = datetime.fromisoformat('2015-01-18T00:00:00Z')
end = datetime.fromisoformat(f'{now:%Y-%m-%d}T00:00:00Z')

news_entries = {'created_at': [], 'updated_at': [], 'headline': [], 'summary': [], 'news_type': [], 'score': [], 'symbols': []}
df_news = pd.DataFrame(news_entries, columns=['created_at', 'updated_at', 'headline', 'summary', 'news_type', 'score', 'symbols'])

page_token = None
while start <= end:
    parameters = {'start': start.isoformat(),
                  'end': end.isoformat(),
                  'page_token': page_token,
                  'symbols': 'JPM',
                  'limit': 50
                  }

    respond = news_client.get(news_endpoint, parameters)
    page_token = respond.get('next_page_token')

    if page_token is None:
        break
    else:
        news_data = respond['news']
        for news in news_data:
            created_at = news['created_at']
            headline = news['headline']
            summary = news['summary']
            sentiment = classifier(summary + headline)[0]
            score = sentiment['score']
            news_type = sentiment['label']
            updated_at = news['updated_at']
            symbols = news['symbols']
            new_entries = {'created_at': created_at, 'updated_at': updated_at, 'headline': headline,
                           'summary': summary, 'news_type': news_type, 'score': score, 'symbols': symbols}
            row = pd.Series(new_entries)
            df_news = pd.concat([df_news, row.to_frame().T], axis=0, ignore_index=True)
        end = datetime.fromisoformat(min([news['created_at'] for news in respond['news']]))

print(df_news.head(5))
df_news.to_csv('news1.csv')
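
For context on the sentiment fields above: the Hugging Face pipeline returns a list with one result dict per input, which is why the code indexes [0] and reads 'label' and 'score'. A small illustration (the exact score will vary by model version):

result = classifier('JPMorgan beats earnings expectations')[0]
print(result)  # e.g. {'label': 'POSITIVE', 'score': 0.9998}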

@Plamen There is news for JPM during the period Feb 2015 to Oct 2016. Nothing seems to be missing.

Here is the code I used to fetch all news for JPM from 2015-01-18 to the current date.

!pip install -q alpaca-py
from alpaca.common.rest import RESTClient

import pandas as pd

ALPACA_API_KEY_ID = 'xxxxx'
ALPACA_API_SECRET_KEY = 'xxxxx'

# instantiate a basic rest client
news_client = RESTClient(base_url='https://data.alpaca.markets',
                         api_version='v1beta1',
                         api_key=ALPACA_API_KEY_ID,
                         secret_key=ALPACA_API_SECRET_KEY)

# create a dataframe to store the news
news_df = pd.DataFrame()
page_token = 'default'

while page_token is not None:

  parameters = {'start': pd.to_datetime('2015-01-18T00:00:00Z').isoformat(),
                'end': pd.to_datetime('today', utc=True).isoformat(),
                'page_token': page_token if page_token!='default' else None,
                'symbols': 'JPM',
                'limit': 50
                }
  response = news_client.get('/news', parameters)
  news_df = pd.concat([news_df, pd.DataFrame(response.get('news'))])
  page_token = response.get('next_page_token')

news_df is a dataframe of all news up to the current day. One can then plot the number of articles per month from Feb 2015 through Oct 2016 (I actually plotted a slightly larger window). Here's the code

# set the index to the created_at column converted to a datetime
news_df.set_index(pd.to_datetime(news_df.created_at, utc=True), inplace=True)

# narrow the window we want to plot
check_start = pd.to_datetime('2015-02-01', utc=True)
check_end = pd.to_datetime('2017-01-01', utc=True)

# plot the qty of articles per month between our check dates
news_df.query('@check_start < index < @check_end').resample('1M').content.count().plot.bar()

Here is the chart showing generally over 20 articles per month throughout 2015-2016.
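
For a quick numeric check instead of a chart, the same resample can simply be printed (a sketch reusing the names from the snippet above):

monthly_counts = news_df.query('@check_start < index < @check_end').resample('1M').content.count()
print(monthly_counts)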

Thanks Dan, yes the data is there. It seems the page_token was used in the wrong way in my code.
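
The fix is the pattern Dan's loop uses: process each response first, then read next_page_token, and stop only when it comes back None. My loop broke out before handling that final page. A minimal sketch (fetch_page and process are hypothetical placeholders):

page_token = None
while True:
    response = fetch_page(page_token)   # hypothetical: one call to /news
    process(response['news'])           # handle the articles before checking the token
    page_token = response.get('next_page_token')
    if page_token is None:              # only stop once the final page is handled
        break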

I just realised that the alpaca-py library returns more rows of data than the REST version of alpaca-trade-api.

import warnings
import pandas as pd
from datetime import datetime
from transformers import pipeline
from decouple import config
from alpaca.data.historical.news import NewsClient
from alpaca.data.requests import NewsRequest

warnings.filterwarnings("ignore")

api_key = config("ALPACA_KEY")
secret_key = config("ALPACA_SECRET")
news_url = config("ALPACA_DATA_URL") # 'https://data.alpaca.markets'

client = NewsClient(api_key, secret_key, url_override=news_url)

classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# zero-padded current date for the ISO end timestamp
now = datetime.now()

start = datetime.fromisoformat('2015-01-01T00:00:00Z')
end = datetime.fromisoformat(f'{now:%Y-%m-%d}T00:00:00Z')
news_entries = {'created_at': [], 'updated_at': [], 'headline': [], 'summary': [], 'url':[], 'news_type': [], 'score': [], 'symbols': []}
df_news = pd.DataFrame(news_entries, columns=['created_at', 'updated_at', 'headline', 'summary', 'url', 'news_type', 'score', 'symbols'])

page_token = 'default'
while page_token is not None:

    request_params = NewsRequest(
        start=start.isoformat(),
        end=end.isoformat(),
        symbols='JPM',
        limit=50,
        page_token=page_token if page_token!='default' else None
    )

    respond = client.get_news(request_params)

    news_data = respond.news
    for news in news_data:
        created_at = news.created_at.isoformat().split('+')[0].replace('T',' ')
        headline = news.headline
        url = news.url
        summary = news.summary
        sentiment = classifier(summary + headline)[0]
        score = sentiment['score']
        news_type = sentiment['label']
        updated_at = news.updated_at.isoformat().split('+')[0].replace('T',' ')
        symbols = news.symbols
        entries = {'created_at': created_at, 'updated_at': updated_at, 'headline': headline, 'summary': summary, 'url': url, 'news_type': news_type, 'score': score, 'symbols': symbols }
        row = pd.Series(entries)
        df_news = pd.concat([df_news, row.to_frame().T], axis=0, ignore_index=True)
    page_token = respond.next_page_token

print(df_news.head(5))
df_news.to_csv('news.csv')
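
One last note on the frame-building above: calling pd.concat once per article re-copies the whole frame on every iteration, which gets slow over thousands of rows. The usual idiom is to collect the dicts in a list and build the DataFrame once at the end; a sketch (fetch_all_entries is a hypothetical generator standing in for the pagination loop):

rows = []                              # collect the per-article dicts here
for entries in fetch_all_entries():    # hypothetical: yields one dict per article
    rows.append(entries)

df_news = pd.DataFrame(rows)           # build the frame once at the end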