Historical news data missing for 2016

According to the News API documentation (News API):

Historical (REST)

News API provides historical news data dating back to 2015. You can expect to receive an average of 130+ news articles per day. All news data is currently provided directly by Benzinga.

I see a big chunk of data is missing between 2015-Feb and 2016-Oct

2547,2016-10-14 10:43:07+00:00,2016-10-14 10:43:07+00:00
2548,2016-10-14 10:38:53+00:00,2016-10-14 10:38:53+00:00
2549,2016-10-14 10:38:22+00:00,2016-10-14 10:38:23+00:00

… Missing Data

2550,2015-02-25 16:01:48+00:00,2015-02-25 16:01:48+00:00
2551,2015-02-25 14:26:21+00:00,2015-02-25 14:26:21+00:00
2552,2015-02-25 12:21:25+00:00,2015-02-25 12:21:25+00:00
2553,2015-02-24 19:28:26+00:00,2015-02-24 19:28:26+00:00
2554,2015-02-24 19:03:52+00:00,2015-02-24 19:03:52+00:00
2555,2015-02-24 16:25:35+00:00,2015-02-24 16:25:35+00:00

Python code used for fetching the news:

import math
import pandas as pd
from datetime import datetime
from transformers import pipeline
from alpaca.common.rest import RESTClient
from decouple import config

# Fetch JPM news from the Alpaca data API, score each article with a
# sentiment classifier, and accumulate the results in `df_news`.
# Credentials are read from the environment/.env via decouple's `config`.
api_key = config("ALPACA_KEY")
secret_key = config("ALPACA_SECRET")

# NOTE(review): the RESTClient(...) call below is cut off in this listing —
# the remaining arguments and the closing parenthesis are not shown.
news_client = RESTClient(base_url='https://data.alpaca.markets',

# Hugging Face pipeline used to label each article and give it a score.
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

news_endpoint = '/news'
# Zero-pad day and month by hand: log10(n) >= 1 is true exactly when n >= 10,
# so single-digit values get a leading '0' for the ISO string built below.
day = datetime.now().day if math.log10(datetime.now().day) >= 1 else f'0{datetime.now().day}'
month = datetime.now().month if math.log10(datetime.now().month) >= 1 else f'0{datetime.now().month}'
year = datetime.now().year
# Query window: fixed start date up to midnight of the current day.
# NOTE(review): datetime.fromisoformat only accepts a trailing 'Z' on
# Python 3.11+ — confirm the runtime version.
start = datetime.fromisoformat('2015-01-18T00:00:00Z')
end =   datetime.fromisoformat(f'{year}-{month}-{day}T00:00:00Z')

# Seed frame: each column starts with a single None, so `df_news` begins with
# one all-None row before any article is appended.
news_entries = {'created_at': [None], 'updated_at': [None], 'headline': [None], 'summary': [None], 'news_type': [None], 'score': [None], 'symbols': [None]}
df_news = pd.DataFrame(news_entries, columns=['created_at', 'updated_at', 'headline', 'summary', 'news_type', 'score', 'symbols'])

# Page through the results, moving `end` backwards until it passes `start`.
page_token = None
while start <= end:
    # NOTE(review): the closing brace of this dict is missing in this listing.
    parameters = {'start': start.isoformat(),
                  'end': end.isoformat(),
                  'page_token': page_token,
                  'symbols': 'JPM',
                  'limit': 50

    respond = news_client.get(news_endpoint, parameters,)
    page_token = respond.get('next_page_token')

    # NOTE(review): articles are stored only when the response carries no
    # next_page_token. Every page that does have a next token is fetched but
    # its articles are never appended, and `end` only moves in this branch.
    # This looks like the source of the gap reported in this thread.
    if page_token is None:
        news_data = respond['news']
        for news in news_data:
            created_at = news['created_at']
            headline = news['headline']
            summary = news['summary']
            # Summary and headline are concatenated with no separator before
            # classification; only the first result of the pipeline is used.
            sentiment = classifier(summary + headline)[0]
            score = sentiment['score']
            news_type = sentiment['label']
            updated_at = news['updated_at']
            symbols = news['symbols']
            new_entries = {'created_at': created_at, 'updated_at': updated_at, 'headline': headline,
                           'summary': summary, 'news_type': news_type, 'score': score, 'symbols': symbols}
            # Append one row at a time by concatenating a single-row frame.
            row = pd.Series(new_entries)
            df_news = pd.concat([df_news, row.to_frame().T], axis=0, ignore_index=True)
        # Shrink the window: the new `end` is the smallest created_at value in
        # this page (min() raises ValueError if the page has no articles).
        end = datetime.fromisoformat(min([news['created_at'] for news in respond['news']]))


@Plamen There is news for JPM between 2015-Feb and 2016-Oct. Nothing seems to be missing.

Here is the code I used to fetch all news for JPM from 2015-01-18 to the current date.

!pip install -q alpaca-py
from alpaca.common.rest import RESTClient

import pandas as pd


# Instantiate a basic REST client pointed at the Alpaca data host.
# NOTE(review): the RESTClient(...) call is cut off in this listing — the
# remaining arguments and the closing parenthesis are not shown.
news_client = RESTClient(base_url='https://data.alpaca.markets',

# Create an empty dataframe to accumulate the news, one page at a time.
# 'default' is a sentinel meaning "first request": it lets the loop start
# (the condition is `is not None`) while still sending page_token=None.
news_df = pd.DataFrame()
page_token = 'default'

# Keep requesting pages until the response carries no next_page_token.
while page_token is not None:

  # Same window and symbol on every request; only page_token changes.
  # NOTE(review): the closing brace of this dict is missing in this listing.
  parameters = {'start': pd.to_datetime('2015-01-18T00:00:00Z').isoformat(),
                'end': pd.to_datetime('today', utc=True).isoformat(),
                'page_token': page_token if page_token!='default' else None,
                'symbols': 'JPM',
                'limit': 50
  response = news_client.get('/news', parameters)
  # Every page is appended, whether or not a next token is present.
  news_df = pd.concat([news_df, pd.DataFrame(response.get('news'))])
  page_token = response.get('next_page_token')

news_df is a dataframe of all news up to the current day. One can then plot the number of articles per month from 2015-Feb through 2016-Oct (I actually plotted a slightly larger window). Here’s the code:

# Index the frame by its created_at column, parsed as UTC datetimes.
news_df.set_index(pd.to_datetime(news_df.created_at, utc=True), inplace=True)

# Narrow the window we want to plot (both bounds are exclusive in the query).
check_start = pd.to_datetime('2015-02-01', utc=True)
check_end = pd.to_datetime('2017-01-01', utc=True)

# Plot the quantity of articles per month between our check dates.
# The count is taken over the `content` column, so only rows with a non-null
# `content` value are counted.
# NOTE(review): newer pandas releases deprecate the '1M' alias in favour of
# '1ME' — confirm against the installed version.
news_df.query('@check_start < index < @check_end').resample('1M').content.count().plot.bar()

Here is the chart showing generally over 20 articles per month throughout 2015-2016.

Thanks Dan, yes, the data is there. It seems the page_token was used the wrong way in my code.

I just realised that the alpaca-py library returns more rows of data than the REST version of alpaca-trade-api:

import warnings
import pandas as pd
import math
from datetime import datetime
from transformers import pipeline
from decouple import config
from alpaca.data.historical.news import NewsClient
from alpaca.data.requests import NewsRequest


# Fetch news through alpaca-py's NewsClient, score each article with a
# sentiment classifier, and accumulate the results in `df_news`.
# Credentials and the data URL are read via decouple's `config`.
api_key = config("ALPACA_KEY")
secret_key = config("ALPACA_SECRET")
news_url = config("ALPACA_DATA_URL") # 'https://data.alpaca.markets'

client = NewsClient(api_key, secret_key, url_override=news_url)

# Hugging Face pipeline used to label each article and give it a score.
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Zero-pad day and month by hand: log10(n) >= 1 is true exactly when n >= 10,
# so single-digit values get a leading '0' for the ISO string built below.
day = datetime.now().day if math.log10(datetime.now().day) >= 1 else f'0{datetime.now().day}'
month = datetime.now().month if math.log10(datetime.now().month) >= 1 else f'0{datetime.now().month}'
year = datetime.now().year

# NOTE(review): datetime.fromisoformat only accepts a trailing 'Z' on
# Python 3.11+ — confirm the runtime version. Neither `start` nor `end` is
# used in the lines visible here; presumably they were passed to the
# truncated NewsRequest(...) below.
start = datetime.fromisoformat('2015-01-01T00:00:00Z')
end = datetime.fromisoformat(f'{year}-{month}-{day}T00:00:00Z')
# Empty frame with the output columns; rows are appended inside the loop.
news_entries = {'created_at': [], 'updated_at': [], 'headline': [], 'summary': [], 'url':[], 'news_type': [], 'score': [], 'symbols': []}
df_news = pd.DataFrame(news_entries, columns=['created_at', 'updated_at', 'headline', 'summary', 'url', 'news_type', 'score', 'symbols'])

# 'default' is a sentinel meaning "first request": it lets the loop start
# while still sending page_token=None on the first call.
page_token = 'default'
while page_token is not None:

    # NOTE(review): the NewsRequest(...) call is cut off in this listing — only
    # page_token is visible; other arguments and the closing parenthesis are
    # not shown.
    request_params = NewsRequest(
        page_token=page_token if page_token!='default' else None

    respond = client.get_news(request_params)

    # Every page is processed, whether or not a next token is present.
    news_data = respond.news
    for news in news_data:
        # Render the timestamp as 'YYYY-MM-DD HH:MM:SS...': everything from the
        # '+' of the UTC offset onward is dropped and 'T' becomes a space.
        created_at = news.created_at.isoformat().split('+')[0].replace('T',' ')
        headline = news.headline
        url = news.url
        summary = news.summary
        # Summary and headline are concatenated with no separator before
        # classification; only the first result of the pipeline is used.
        sentiment = classifier(summary + headline)[0]
        score = sentiment['score']
        news_type = sentiment['label']
        updated_at = news.updated_at.isoformat().split('+')[0].replace('T',' ')
        symbols = news.symbols
        entries = {'created_at': created_at, 'updated_at': updated_at, 'headline': headline, 'summary': summary, 'url': url, 'news_type': news_type, 'score': score, 'symbols': symbols }
        # Append one row at a time by concatenating a single-row frame.
        row = pd.Series(entries)
        df_news = pd.concat([df_news, row.to_frame().T], axis=0, ignore_index=True)
    # Advance to the next page; the loop ends when no token is returned.
    page_token = respond.next_page_token
