import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import time
import nltk.data
from nltk import tokenize
import re
columns = ['Year', 'Week Start', 'Week End', 'Section', 'Number', 'Headline', 'Body Text']
articles_df = pd.read_csv('https://s3.amazonaws.com/cs109data/articles_db.csv', names=columns)
articles_df.head(100)
articles_df.shape
# List of relevant keywords loaded from file
relevant_words = np.genfromtxt('Keywords.txt', dtype='str')

def find_relevant(text, n):
    """Return True if the text contains more than n of the relevant keywords."""
    text = str(text)
    matched_words = [word for word in relevant_words if (' ' + word + ' ') in text]
    return len(matched_words) > n
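As a quick sanity check of the keyword matching (illustrative only; it simply reuses the keyword list loaded above), we can list which keywords appear in the first article:
# Keywords are padded with spaces, exactly as in find_relevant, so only
# whole-word, whitespace-delimited matches are counted.
first_body = str(articles_df['Body Text'].values[0])
matched = [w for w in relevant_words if (' ' + w + ' ') in first_body]
print(len(matched), 'keywords matched, e.g.', matched[:5])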
text = articles_df['Body Text'].values[0]
plt.figure(figsize=(10, 5))
for n in [3, 5, 7]:
    relevant_articles = [find_relevant(text, n) for text in articles_df['Body Text'].values]
    relevant_df = articles_df[relevant_articles]
    weekly_articles = relevant_df.groupby('Week Start').size().reset_index()
    plt.plot(weekly_articles[0], label='n=' + str(n))
plt.legend(loc='best')
plt.ylabel('Number of relevant articles')
axes = plt.gca()
axes.set_ylim([0, 50])
n = 3
relevant_articles = [find_relevant(text, n) for text in articles_df['Body Text'].values]
relevant_df = articles_df[relevant_articles]
articles_per_section = relevant_df.groupby(['Week Start', 'Section']).size().reset_index()
plt.figure(figsize=(15, 5))
for section in articles_per_section['Section'].unique():
    articles_count = articles_per_section[articles_per_section['Section'] == section]
    plt.plot(range(0, len(articles_count[0])), articles_count[0], label=section)
plt.xlabel('Week', fontsize=20)
plt.ylabel('Number of relevant articles', fontsize=20)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.grid(True)
plt.legend()
axes = plt.gca()
axes.set_ylim([0, 30])
# Source code for sentiwordnet: http://www.nltk.org/_modules/nltk/corpus/reader/sentiwordnet.html
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
# A simple function to obtain the overall sentiment of a text chunk.
# Method: tokenise the text chunk, obtain the sentiment score of each token, then take the mean.
# Note: you may need to separately download SentiWordNet: nltk.download('sentiwordnet')
# Possible refinements (not implemented here): choose synsets based on context, score phrases
# rather than single tokens, and classify words as nouns/adjectives
# (e.g. an unsupervised split between adjectives).
def simple_sentiment(text_chunk):
    cumulative_pos_sentiment = 0
    cumulative_neg_sentiment = 0
    index = 0
    # Tokenizing the sample text
    tokens = nltk.word_tokenize(text_chunk)
    # Removing words of length 2 or less
    tokens = [i for i in tokens if len(i) >= 3]
    # Removing stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # a/n/v/r represent adjective/noun/verb/adverb respectively. They are used to index the SentiWordNet dictionary.
    # For each token, use the first part of speech (in the order a, n, v, r) that has any synsets.
    for i in tokens:
        for pos_tag in ('a', 'n', 'v', 'r'):
            synsets = list(swn.senti_synsets(i, pos_tag))
            if len(synsets) > 0:
                cumulative_pos_sentiment += synsets[0].pos_score()
                cumulative_neg_sentiment += synsets[0].neg_score()
                index += 1
                break
    avg_pos_sentiment = cumulative_pos_sentiment / float(1 if (index == 0) else index)
    avg_neg_sentiment = cumulative_neg_sentiment / float(1 if (index == 0) else index)
    # print('Positive sentiment:', avg_pos_sentiment)
    # print('Negative sentiment:', avg_neg_sentiment)
    return (avg_pos_sentiment, avg_neg_sentiment)
sample_text = 'There\'s no time to waste,"Over the past few months, President Clinton has lost few opportunities to sing the praises of his favourite book. In November, he told a conference attended by Tony Blair that it was no longer necessary to choose between growth and environment. He took as evidence Natural Capitalism, The Next Industrial Revolution (Paul Hawken and Amory and Hunter Lovins, Earthscan, pounds 18.99), which \'proves beyond argument that there are presently available technologies, and those just on the horizon, which will permit us to get richer by cleaning, not by spoiling, the environment. This is a huge deal,\' Clinton said. It\'s a suitably millennial claim. The authors argue that \'capitalism, as practised, is a financially profitable, nonsustainable aberration in human development... [which] does not fully conform to its own accounting principles. It liquidates its capital and calls it income. It neglects to assign any value to the largest stocks of capital it employs, the natural resources and living systems, as well as the social and cultural systems that are the basis of human capital.\' Companies, as has been well said, are brilliant externalising machines, pocketing the profits and shunting the costs of their enterprise on to the collectivity. Thus, the NHS pays for the profits of big tobacco, and the Government subsidises cars by building roads. Put it another way, business is a free rider on the environment and the services it provides, services which have been tentatively valued by Nature magazine at $36 trillion annually, roughly the same as world GDP. The reason business is so profligate with the the environment (the \'natural capital\' of the book) is that its goods are assumed by economists to be free and infinitely substitutable. So they are uncosted. But in reality they are not free. They are produced by the earth\'s 3.8-billion-year store of natural capital which, as the authors rehearse with hair-raising thoroughness, is being eroded so fast that by the end of this century there will be little left. And there is no conceivable substitute, for example, for the biosphere\'s ability to produce oxygen. The authors manage to recast this rush to disaster as a story with a (potentially) happier ending. Their grounds for optimism are partly familiar American technological optimism, if natural resources were treated as scarce and expensive, then nanotechnology and biotechnology could multiply four or even tenfold the outputs from today\'s inputs. Hence Clinton\'s enthusiasm. But more crucial to the project is a complete mental flip of what an \'output\' consists of (as Edwin Land once said, a great idea is often \'not having a new thought but stopping having an old one\'). At present, it is entirely conceivable that one-quarter or even half of the GDP of advanced countries makes not value but waste. Most industrial processes, and the economy as a whole, are inefficient , at best achieving 10 per cent of their potential likewise their products. A car uses just 1 per cent of the energy it burns to propel the driver, the rest to warm the atmosphere, deafen pedestrians and shift ponderous steel boxes between traffic jams. Moreover, waste is cumulative, so an increasing income has to be spent on alleviating growth\'s byproducts, pollution, traffic accidents and congestion, crime. 
Hence the phenomenon of uneconomic growth, where increases in nominal wealth produce no net gain in quality of life or standard of living: in real terms 80 per cent of Americans are no better off than they were in 1979. However, the grossness of the waste is, say the authors, also a measure of the huge potential for improvement if the spiral changed to virtuous. The secret is taking a systems view in which it is always more expensive to get rid of waste than to design it out in the first place. Given the wastefulness of most current systems, improvements of 10 to 100 times in overall efficiency are possible even with existing technology. Much of what the Lovins and Hawken propose is not new. Frances Cairncross wrote about costing the earth 10 years ago, and Richard Schonberger coined the term \'frugal manufacturing\' in the 1980s. What is new is the way these ideas are brought together in a systems approach to business and the environment, and the coopting of markets as the mechanism which can be used to turn things around. There is some irony here, of course. The greatest obstacle to \'natural capitalism\' in practice will be the vested interests and special pleading of those most vociferous champions of capitalist orthodoxy, US companies, which emerge from this book the masters of the perverse, not to mention grotesque, hidden subsidy, whether of agriculture, cars, or their wealthy executives. Persuading them to confront their own bad faith will be no easy matter. But, as someone once said, the economy is a wholly-owned subsidiary of the environment, and time is running out for the parent to bring it to heel.'
simple_sentiment(sample_text)
n = 3
relevant_articles = [find_relevant(text, n) for text in articles_df['Body Text'].values]
relevant_df = articles_df[relevant_articles]
weeks = relevant_df['Week Start'].unique()
avg_weekly_pos_score = np.zeros((len(weeks), 1))
avg_weekly_neg_score = np.zeros((len(weeks), 1))
avg_weekly_pos_minus_neg_score = np.zeros((len(weeks), 1))
# Calculate weekly sentiment scores across the entire time period
for i, week in enumerate(weeks):
    articles = relevant_df[relevant_df['Week Start'] == week]['Body Text']
    num_articles = articles.shape[0]
    pos_score = 0
    neg_score = 0
    for article in articles:
        pos, neg = simple_sentiment(article)
        pos_score += pos
        neg_score += neg
    avg_weekly_pos_score[i] = pos_score / float(num_articles)
    avg_weekly_neg_score[i] = neg_score / float(num_articles)
    avg_weekly_pos_minus_neg_score[i] = avg_weekly_pos_score[i] - avg_weekly_neg_score[i]
    if (i % 10 == 0):
        print('Week: ', week, 'Positive: ', avg_weekly_pos_score[i][0], 'Negative: ', avg_weekly_neg_score[i][0])
# Saving file to not recalculate every time
# scores_df = pd.DataFrame()
# scores_df['weeks']=weeks
# scores_df['avg_weekly_pos_score']=avg_weekly_pos_score
# scores_df['avg_weekly_neg_score']=avg_weekly_neg_score
# scores_df.to_csv('scores_df.csv',index=False)
scores_df = pd.read_csv('scores_df.csv')
avg_weekly_pos_score = scores_df['avg_weekly_pos_score']
avg_weekly_neg_score = scores_df['avg_weekly_neg_score']
plt.figure(figsize=(15, 10))
plt.plot(scores_df['avg_weekly_pos_score'])
plt.xlabel('Week', fontsize=20)
plt.ylabel('Average weekly positive sentiment score', fontsize=20)
plt.figure(figsize=(15, 10))
plt.plot(scores_df['avg_weekly_neg_score'],'red')
plt.xlabel('Week', fontsize=20)
plt.ylabel('Average weekly negative sentiment score', fontsize=20)
daily_data = pd.read_csv('daily_rates.csv', skiprows=3, header=0)
monthly_data = pd.read_csv('monthly_rates.csv', skiprows=11, header=0)
daily_data['datetime'] = pd.to_datetime(daily_data['DATE'])
monthly_data['datetime'] = pd.to_datetime(monthly_data['DATE'])
daily_data['dayofweek'] = daily_data['datetime'].apply(lambda row: row.dayofweek)
# Keep only Fridays (dayofweek == 4), giving one exchange-rate observation per week
weekly_data = daily_data[daily_data['dayofweek'] == 4]
timestamp_weeks = [pd.to_datetime(week) for week in weeks]
fig, ax1 = plt.subplots( figsize=(20,15))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLERS'], 'brown', linewidth=2, label=str('EUR/GBP'))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLUSS'], 'blue', linewidth=2, label=str('USD/GBP'))
ax1.legend(loc='best', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Euro and US Dollar to Pound exchange rate', fontsize=20)
ax1.grid(True)
ax1.set_ylim([min(min(weekly_data['XUDLERS']),min(weekly_data['XUDLUSS'])),max(max(weekly_data['XUDLERS']),max(weekly_data['XUDLUSS']))])
ax1.axvline(x=datetime.datetime(2016,1,8), color='grey', linewidth=2)
ax1.axvline(x=datetime.datetime(2007,1,5), color='orange', linewidth=2)
ax1.axvline(x=datetime.datetime(2009,1,12), color='orange', linewidth=2)
ax2 = ax1.twinx()
ax2.plot(timestamp_weeks, scores_df['avg_weekly_pos_score'], 'green',linewidth=0.5, label = 'Average weekly positive score')
ax2.set_ylabel('Positive Sentiment Score', color='green',fontsize=20)
ax2.set_ylim([0.0,0.1])
for tl in ax2.get_yticklabels():
    tl.set_color('green')
plt.show()
fig, ax1 = plt.subplots( figsize=(20,15))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLERS'], 'brown', linewidth=2, label=str('EUR/GBP'))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLUSS'], 'blue', linewidth=2, label=str('USD/GBP'))
ax1.legend(loc='best', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Euro and US Dollar to Pound exchange rate', fontsize=20)
ax1.grid(True)
ax1.set_ylim([min(min(weekly_data['XUDLERS']),min(weekly_data['XUDLUSS'])),max(max(weekly_data['XUDLERS']),max(weekly_data['XUDLUSS']))])
ax1.axvline(x=datetime.datetime(2016,1,8), color='grey', linewidth=2)
ax1.axvline(x=datetime.datetime(2007,1,5), color='orange', linewidth=2)
ax1.axvline(x=datetime.datetime(2009,1,12), color='orange', linewidth=2)
ax2 = ax1.twinx()
ax2.plot(timestamp_weeks, scores_df['avg_weekly_neg_score'], 'red',linewidth=0.5, label = 'Average weekly negative score')
ax2.set_ylabel('Negative Sentiment Score', color='red',fontsize=20)
ax2.set_ylim([0.0,0.1])
for tl in ax2.get_yticklabels():
    tl.set_color('red')
plt.show()
To reduce variance and smooth the sentiment scores, we take the running average of the preceding ten weeks as our sentiment time series (implemented below). With this smoothing, trends shared by the exchange rates and the net sentiment become visible.
def running_average_smoother(values, parameter):
    # Trailing running average over the preceding `parameter` observations
    return [np.mean(values[k - (parameter - 1):k + 1]) for k in range(parameter - 1, len(values))]
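As a quick sanity check (illustrative only), the smoother applied to a toy list with a window of three returns the trailing three-point means:
# Each output value is the mean of the current element and the two before it
running_average_smoother([1, 2, 3, 4, 5], 3)  # -> [2.0, 3.0, 4.0]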
# Example plot: the running average of the preceding ten weekly sentiment scores is used to smooth the sentiment series
# Smoothing parameter
smooth = 10
fig, ax1 = plt.subplots( figsize=(20,15))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLUSS'], 'blue', linewidth=2, label=str('USD/GBP'))
ax1.legend(loc='best', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Euro and US Dollar to Pound exchange rate', fontsize=20)
ax1.grid(True)
ax1.set_ylim([min(min(weekly_data['XUDLERS']),min(weekly_data['XUDLUSS'])),max(max(weekly_data['XUDLERS']),max(weekly_data['XUDLUSS']))])
ax1.axvline(x=datetime.datetime(2016,1,8), color='grey', linewidth=2)
ax1.axvline(x=datetime.datetime(2007,1,5), color='orange', linewidth=2)
ax1.axvline(x=datetime.datetime(2009,1,12), color='orange', linewidth=2)
avg_weekly_pos_score_smooth = running_average_smoother(avg_weekly_pos_score,smooth)
avg_weekly_neg_score_smooth = running_average_smoother(avg_weekly_neg_score,smooth)
ax2 = ax1.twinx()
ax2.plot(timestamp_weeks[smooth-1:], avg_weekly_pos_score_smooth, 'green',linewidth=1, label = 'Weekly positive score')
ax2.plot(timestamp_weeks[smooth-1:], [-1*avg_weekly_neg_score_smooth[k] for k in range(0,len(avg_weekly_neg_score_smooth))], 'red',linewidth=1, label = 'Weekly negative score')
ax2.legend(loc='best', fontsize=20)
ax2.set_ylabel('Smoothed Sentiment Scores', color='grey', fontsize=20)
ax2.set_ylim([-0.08, 0.08])
for tl in ax2.get_yticklabels():
    tl.set_color('grey')
plt.show()
Comment: a correlation between the sentiment scores and the exchange rates is somewhat visible.
# Calculating the time series correlation: note we shift the (Friday) exchange-rate dates
# forward by 3 days to align them with the articles' 'Week Start' dates
# (we are working at a weekly level)
weekly_data_shifted = weekly_data['datetime'] + datetime.timedelta(days=3)
ts1 = []
ts2 = []
for k in weekly_data_shifted:
    if k in timestamp_weeks[9:]:
        ts1.append(weekly_data['XUDLERS'][weekly_data_shifted == k])
        ts2.append(avg_weekly_pos_score_smooth[timestamp_weeks[9:].index(k)])
ts1 = [ts1[i].values[0] for i in range(0, len(ts1))]
print(np.corrcoef(ts1, ts2)[0][1])
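As an aside, a tiny illustration of the 3-day shift (the daily-rates filtering above keeps Fridays, so shifting by three days lands on the following Monday, which appears to be what the articles' 'Week Start' dates correspond to):
# 2016-01-08 is a Friday; shifted by 3 days it becomes Monday 2016-01-11
print(pd.Timestamp('2016-01-08') + datetime.timedelta(days=3))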
# A function for calculating the time series correlation for different running average smoothing parameters
def correlation_function(smoothing_parameter, avg_weekly_pos_score, avg_weekly_neg_score):
    # We shift dates by 3 days for matching (we are working at a weekly level)
    weekly_data_shifted = weekly_data['datetime'] + datetime.timedelta(days=3)
    # Smooth once, outside the matching loop
    pos_smooth = running_average_smoother(avg_weekly_pos_score, smoothing_parameter)
    neg_smooth = running_average_smoother(avg_weekly_neg_score, smoothing_parameter)
    ts0 = []
    ts1 = []
    ts2 = []
    ts3 = []
    for k in weekly_data_shifted:
        if k in timestamp_weeks[smoothing_parameter:]:
            if timestamp_weeks[smoothing_parameter:].index(k) <= len(timestamp_weeks[smoothing_parameter:]) - 1:
                ts0.append(weekly_data['XUDLUSS'][weekly_data_shifted == k])
                ts1.append(weekly_data['XUDLERS'][weekly_data_shifted == k])
                ts2.append(pos_smooth[timestamp_weeks[smoothing_parameter:].index(k)])
                ts3.append(neg_smooth[timestamp_weeks[smoothing_parameter:].index(k)])
    ts0 = [ts0[i].values[0] for i in range(0, len(ts0))]
    ts1 = [ts1[i].values[0] for i in range(0, len(ts1))]
    usd_pos_score_corr = np.corrcoef(ts0, ts2)[0][1]
    usd_neg_score_corr = np.corrcoef(ts0, ts3)[0][1]
    eur_pos_score_corr = np.corrcoef(ts1, ts2)[0][1]
    eur_neg_score_corr = np.corrcoef(ts1, ts3)[0][1]
    print('Correlation values are:', 'GBPUSD with Pos scores:', usd_pos_score_corr,
          'GBPUSD with Neg scores:', usd_neg_score_corr, 'GBPEUR with Pos scores:', eur_pos_score_corr,
          'GBPEUR with Neg scores:', eur_neg_score_corr)
    return (usd_pos_score_corr, usd_neg_score_corr, eur_pos_score_corr, eur_neg_score_corr)
correlation_scores = [correlation_function(k,avg_weekly_pos_score,avg_weekly_neg_score) for k in range(10,100,5)]
correlation_scores_USD_pos = np.asarray(correlation_scores)[:,0]
correlation_scores_USD_neg = np.asarray(correlation_scores)[:,1]
correlation_scores_EUR_pos = np.asarray(correlation_scores)[:,2]
correlation_scores_EUR_neg = np.asarray(correlation_scores)[:,3]
fig, ax1 = plt.subplots( figsize=(10,10) )
ax1.plot(correlation_scores_USD_pos, 'yellow',label = 'USD and Pos Sentiment Correlation')
ax1.plot(correlation_scores_USD_neg, 'red',label = 'USD and Neg Sentiment Correlation')
ax1.plot(correlation_scores_EUR_pos, 'green',label = 'EUR and Pos Sentiment Correlation')
ax1.plot(correlation_scores_EUR_neg, 'orange',label = 'EUR and Neg Sentiment Correlation')
ax1.set_ylabel('Correlation', fontsize=20)
ax1.legend(loc='best', fontsize=20)
ax1.set_ylim([-1,1])
# Smoothing parameter giving the strongest (most negative) GBPUSD/negative-sentiment correlation
[k for k in range(10, 100, 5)][np.argmin(correlation_scores_USD_neg)]
From the plot above, we note that negative sentiment is correlated with GBPUSD, and positive sentiment is correlated with GBPEUR. We use the negative sentiment correlation with GBPUSD to select 60 as the smoothing parameter.
# Example plot: the running average of the preceding 60 weekly sentiment scores is used to smooth the sentiment series
# Smoothing parameter
smooth = 60
fig, ax1 = plt.subplots( figsize=(20,15))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLUSS'], 'blue', linewidth=2, label=str('USD/GBP'))
ax1.legend(loc='best', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Euro and US Dollar to Pound exchange rate', fontsize=20)
ax1.grid(True)
ax1.set_ylim([min(min(weekly_data['XUDLERS']),min(weekly_data['XUDLUSS'])),max(max(weekly_data['XUDLERS']),max(weekly_data['XUDLUSS']))])
ax1.axvline(x=datetime.datetime(2016,1,8), color='grey', linewidth=2)
ax1.axvline(x=datetime.datetime(2007,1,5), color='orange', linewidth=2)
ax1.axvline(x=datetime.datetime(2009,1,12), color='orange', linewidth=2)
avg_weekly_pos_score_smooth = running_average_smoother(avg_weekly_pos_score,smooth)
avg_weekly_neg_score_smooth = running_average_smoother(avg_weekly_neg_score,smooth)
ax2 = ax1.twinx()
ax2.plot(timestamp_weeks[smooth-1:], avg_weekly_pos_score_smooth, 'green',linewidth=1, label = 'Weekly positive score')
ax2.plot(timestamp_weeks[smooth-1:], [-1*avg_weekly_neg_score_smooth[k] for k in range(0,len(avg_weekly_neg_score_smooth))], 'red',linewidth=1, label = 'Weekly negative score')
ax2.legend(loc='best', fontsize=20)
ax2.set_ylabel('Smoothed Sentiment Scores', color='grey', fontsize=20)
ax2.set_ylim([-0.08, 0.08])
for tl in ax2.get_yticklabels():
    tl.set_color('grey')
plt.show()
weeks = relevant_df['Week Start'].unique()
avg_weekly_pos_score_fl20 = np.zeros((len(weeks), 1))
avg_weekly_neg_score_fl20 = np.zeros((len(weeks), 1))
for i, week in enumerate(weeks):
    # First and last twenty words only
    articles = relevant_df[relevant_df['Week Start'] == week]['Body Text']
    num_articles = articles.shape[0]
    pos_score_fl20 = 0
    neg_score_fl20 = 0
    for article in articles:
        # Only keeping the first and last 20 words
        article = ' '.join(article.split()[0:20] + article.split()[-20:])
        pos, neg = simple_sentiment(article)
        pos_score_fl20 += pos
        neg_score_fl20 += neg
    avg_weekly_pos_score_fl20[i] = pos_score_fl20 / float(num_articles)
    avg_weekly_neg_score_fl20[i] = neg_score_fl20 / float(num_articles)
    if (i % 10 == 0):
        print('Week: ', week, 'Positive: ', avg_weekly_pos_score_fl20[i][0], 'Negative: ', avg_weekly_neg_score_fl20[i][0])
# Example plot: the running average of the preceding 60 weekly first/last-20-word sentiment scores is used to smooth the series
# Smoothing parameter set to 60
smooth = 60
fig, ax1 = plt.subplots( figsize=(20,15))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLERS'], 'brown', linewidth=2, label=str('EUR/GBP'))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLUSS'], 'blue', linewidth=2, label=str('USD/GBP'))
ax1.legend(loc='best', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Euro and US Dollar to Pound exchange rate', fontsize=20)
ax1.grid(True)
ax1.set_ylim([min(min(weekly_data['XUDLERS']),min(weekly_data['XUDLUSS'])),max(max(weekly_data['XUDLERS']),max(weekly_data['XUDLUSS']))])
ax1.axvline(x=datetime.datetime(2016,1,8), color='grey', linewidth=2)
ax1.axvline(x=datetime.datetime(2007,1,5), color='orange', linewidth=2)
ax1.axvline(x=datetime.datetime(2009,1,12), color='orange', linewidth=2)
avg_weekly_pos_score_smooth = running_average_smoother(avg_weekly_pos_score_fl20,smooth)
avg_weekly_neg_score_smooth = running_average_smoother(avg_weekly_neg_score_fl20,smooth)
ax2 = ax1.twinx()
ax2.plot(timestamp_weeks[smooth-1:], avg_weekly_pos_score_smooth, 'green',linewidth=1, label = 'Weekly positive score')
ax2.plot(timestamp_weeks[smooth-1:], [-1*avg_weekly_neg_score_smooth[k] for k in range(0,len(avg_weekly_neg_score_smooth))], 'red',linewidth=1, label = 'Weekly negative score')
ax2.set_ylabel('Smoothed Sentiment Scores', color='grey', fontsize=20)
ax2.set_ylim([-0.08, 0.08])
for tl in ax2.get_yticklabels():
    tl.set_color('grey')
plt.show()
Let us compare the correlations of the two models: one looks only at the sentiment of the first and last 20 words of each article, the other at the sentiment of the full article.
print('Full article sentiment correlation: \n', correlation_function(60,avg_weekly_pos_score,avg_weekly_neg_score),
'\n First/Last 20 words sentiment correlation: \n', correlation_function(60,avg_weekly_pos_score_fl20,avg_weekly_neg_score_fl20),)
# First/last 20 words sentiment analysis function:
def simple_sentiment_fl20(text_chunk):
    # Only keeping the first/last 20 words
    text_chunk = ' '.join(text_chunk.split()[0:20] + text_chunk.split()[-20:])
    return simple_sentiment(text_chunk)
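As a quick illustrative comparison (using the sample text defined earlier), the truncated scorer can be run alongside the full-article scorer:
# Scores differ because only the first and last 20 words are considered
print(simple_sentiment(sample_text))
print(simple_sentiment_fl20(sample_text))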
# A function to remove the neutral sentences from a given text
def remove_neu_sentences(text, neutral_threshold):
    sentences = tokenize.sent_tokenize(text)
    output_text = []
    for sentence in sentences:
        # Keep a sentence only if its neutrality (1 - pos - neg) is below the threshold
        if (1 - sum(simple_sentiment(sentence))) < neutral_threshold:
            output_text.append(sentence)
    return ' '.join(output_text)
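A small illustration of the filter on a made-up two-sentence string (the exact behaviour depends on the SentiWordNet scores of the individual words, so this is only a sketch):
# With threshold 0.9, a sentence is kept only if its combined positive + negative
# score exceeds 0.1; sentences scoring close to zero are treated as neutral and dropped.
example_text = 'The meeting is scheduled for Tuesday. The outlook is terrible and investors are worried.'
print(remove_neu_sentences(example_text, 0.9))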
# A function for calculating the sentiment scores with the neutral sentences removed
def sentiscores_neu_removed(neutral_threshold):
    weeks = relevant_df['Week Start'].unique()
    avg_weekly_pos_score = np.zeros((len(weeks), 1))
    avg_weekly_neg_score = np.zeros((len(weeks), 1))
    for i, week in enumerate(weeks):
        articles = relevant_df[relevant_df['Week Start'] == week]['Body Text']
        num_articles = 0
        pos_score = 0
        neg_score = 0
        for article in articles:
            article_neutral_removed = remove_neu_sentences(article, neutral_threshold)
            if len(article_neutral_removed) > 2:
                num_articles += 1
                pos, neg = simple_sentiment(article_neutral_removed)
                pos_score += pos
                neg_score += neg
        avg_weekly_pos_score[i] = pos_score / float(1 if num_articles == 0 else num_articles)
        avg_weekly_neg_score[i] = neg_score / float(1 if num_articles == 0 else num_articles)
        if (i % 10 == 0):
            print('Week: ', week, 'Positive: ', avg_weekly_pos_score[i][0], 'Negative: ', avg_weekly_neg_score[i][0])
    return weeks, avg_weekly_pos_score, avg_weekly_neg_score
dataframe_input = sentiscores_neu_removed(0.9)
# # Saving file to not recalculate every time
# scores_df = pd.DataFrame()
# scores_df['weeks']=dataframe_input[0]
# scores_df['avg_weekly_pos_score'] = dataframe_input[1]
# scores_df['avg_weekly_neg_score'] = dataframe_input[2]
# scores_df.to_csv('scores_df_0.9.csv',index=False)
# Example plot: the running average of the preceding 20 weekly sentiment scores is used to smooth the sentiment series
# Smoothing parameter
smooth = 20
fig, ax1 = plt.subplots( figsize=(20,15))
ax1.plot(weekly_data['datetime'], weekly_data['XUDLUSS'], '#4b97ab', linewidth=2, label=str('USD/GBP'))
ax1.legend(loc='best', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Euro and US Dollar to Pound exchange rate', fontsize=20)
ax1.grid(True)
ax1.set_ylim([min(min(weekly_data['XUDLERS']),min(weekly_data['XUDLUSS'])),max(max(weekly_data['XUDLERS']),max(weekly_data['XUDLUSS']))])
ax1.axvline(x=datetime.datetime(2016,1,8), color='grey', linewidth=2)
ax1.axvline(x=datetime.datetime(2007,1,5), color='orange', linewidth=2)
ax1.axvline(x=datetime.datetime(2009,1,12), color='orange', linewidth=2)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
avg_weekly_pos_score_smooth = running_average_smoother(avg_weekly_pos_score,smooth)
avg_weekly_neg_score_smooth = running_average_smoother(avg_weekly_neg_score,smooth)
ax2 = ax1.twinx()
ax2.plot(timestamp_weeks[smooth-1:], avg_weekly_pos_score_smooth, 'green',linewidth=1, label = 'Weekly positive score')
ax2.plot(timestamp_weeks[smooth-1:], [-1*avg_weekly_neg_score_smooth[k] for k in range(0,len(avg_weekly_neg_score_smooth))], 'red',linewidth=1, label = 'Weekly negative score')
ax2.set_ylabel('Smoothed Sentiment Scores', color='grey', fontsize=20)
ax2.set_ylim([-0.08, 0.08])
for tl in ax2.get_yticklabels():
    tl.set_color('grey')
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.savefig('sentiment_correlation_pic.png', dpi=200)