#import required libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import pandas as pd
import random
from datetime import datetime,timedelta
import requests
import json
import re
import time
from urllib.parse import quote
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Load the personal Guardian API key from a text file kept outside this
# folder, stripping surrounding whitespace such as a trailing newline.
with open('../../API key/GuardianAPIkey.txt', 'r') as file:
    key = file.read().strip()
# Print only the key's length (never the key itself) as a sanity check.
print(f'Length of the API key: {len(key)} characters')

# Root of the Guardian content API; fetch_data appends the search query to it.
base_url = 'https://content.guardianapis.com/'
# Relative directory (with trailing slash) where fetched articles are stored
# as JSON and later read back.
file_path_store_data = "data/"

Length of the API key: 36 characters

# Fetch search result pages from url
def fetch_data(search_string,office,from_date,to_date):
    """Search the Guardian content API and save every matching article to disk.

    Requests the first result page, collects the remaining pages through
    get_all_articles_for_response, and writes the combined articles as JSON
    to "<file_path_store_data><search_string>_articles.json".

    Args:
        search_string: Query text sent as the ``q`` parameter.
        office: Value sent as the ``production-office`` filter.
        from_date: Start of the date window, sent as ``from-date``.
        to_date: End of the date window, sent as ``to-date``.

    Returns:
        The dict of articles keyed by title, or False when the reply holds
        no "response" object.
    """
    # Percent-encode every caller-supplied value: an unescaped "&" or space
    # inside a value would otherwise cut the query string apart. The office
    # comes from the `office` argument (not from a module-level variable),
    # and the end of the date window is sent along with its start.
    full_url = (
        f"{base_url}search?q={quote(search_string)}"
        f"&production-office={quote(str(office))}"
        f"&from-date={quote(str(from_date))}"
        f"&to-date={quote(str(to_date))}"
        f"&show-fields=body&api-key={key}&show-tags=keyword"
    )
    print(f"Fetching contents related to '{search_string}' from production office {office} from {from_date} to {to_date}")
    server_response = requests.get(full_url)
    server_data = server_response.json()
    resp_data = server_data.get('response','')
    if resp_data == '':
        # No "response" object in the reply: show the raw payload to the user.
        print("ERROR obtaining results:",server_data)
        return False

    print("SUCCESS!")
    print(f"{resp_data['total']} results found available")
    articles = get_all_articles_for_response(resp_data,full_url)
    file_name = f"{search_string}_articles.json"
    with open(f"{file_path_store_data}{file_name}",'w', encoding='utf-8') as fp:
        fp.write(json.dumps(articles))
    print(f"Data saved to {file_path_store_data}{file_name}")
    return articles

def articles_from_page_results(page_results):
    """Turn one page of search results into a dict of article records.

    Each key is the article's web title followed by its publication date in
    square brackets. Each value is a dict with the content type, the section
    name, the de-duplicated fragments of the tag ids, and the body text with
    markup and newline characters removed.
    """
    parsed = {}
    for item in page_results:
        published = item['webPublicationDate']
        heading = item['webTitle']+f" [{published}]"
        body_html = item['fields']['body']
        kind = item['type']
        section_name = item['sectionName']
        # Split every tag id on "/" and pool the pieces; the set drops repeats.
        fragments = {
            piece
            for tag in item['tags']
            for piece in tag["id"].split('/')
        }
        # Remove anything between angle brackets, then every newline character.
        without_markup = re.sub(r'<.*?>','',body_html)
        plain_text = re.sub(r'\n','',without_markup)

        parsed[heading] = {
            "type": kind,
            "section": section_name,
            "keywords": list(fragments),
            "text": plain_text,
        }
    return parsed
     


def get_all_articles_for_response(response_json,full_url):
    """Collect the articles of every result page of a search.

    The first page is taken from `response_json`; pages 2 onwards are
    requested by appending "&page=<n>" to `full_url`. Progress is printed
    after each extra page.

    Returns:
        A dict of all collected articles, as produced by
        articles_from_page_results.
    """
    page_count = response_json['pages']
    article_total = response_json['total']
    print(f"Fetching {article_total} articles from {page_count} pages...")

    # Start with the page that was already downloaded.
    collected = {}
    collected.update(articles_from_page_results(response_json['results']))

    page_number = 2
    while page_number <= page_count:
        reply = requests.get(f"{full_url}&page={page_number}")
        payload = reply.json()['response']
        collected.update(articles_from_page_results(payload['results']))
        print(f"Status: {len(collected)} articles fetched.")
        # Pause between requests so the API is not hit too hard.
        time.sleep(1)
        page_number += 1

    print(f"FINISHED: Fetched {len(collected)} articles.")
    return collected

def print_random_number_titles(articles, number):
    """Print `number` distinct, randomly chosen keys (titles) of `articles`."""
    chosen_positions = random.sample(range(len(articles)), number)
    titles = list(articles.keys())
    for position in chosen_positions:
        print(titles[position])

# Reporting window: from seven days before today's local date up to today.
today = datetime.today().date() 
seven_days_ago = today - timedelta(days=7)

# Dataset A: articles that mention "Australia".
search_string = "Australia"
# NOTE(review): confirm that the API accepts this syntax for combining
# production offices.
production_office = "uk & us"
# The dates are handed to fetch_data as strings.
from_date = str(seven_days_ago)
to_date=str(today)

############## Uncomment on weekly report ##########################
# fetch_data(search_string,production_office,from_date,to_date)
############## Uncomment on weekly report ##########################

# With the fetch commented out, the analysis reuses whatever JSON file was
# saved for this search string by an earlier fetch.
file_name = f"{search_string}_articles.json"

with open(f"{file_path_store_data}{file_name}",'r', encoding='utf-8') as fp:
    australia_articles = json.load(fp)


# Show at most five random titles as a quick look at the loaded data.
print(f"=====Random article titles from {len(australia_articles)} articles=====")
print_random_number_titles(australia_articles,min(5, len(australia_articles)))

=====Random article titles from 57 articles=====
MPs are voting on the next stage of the assisted dying bill. This is their chance to create a legacy | Polly Toynbee [2025-05-15T17:00:31Z]
‘Cinema doesn’t ship that way’: Wes Anderson mocks Donald Trump’s film tariff plans in Cannes [2025-05-19T14:48:58Z]
‘My sadness is not a burden’: author Yiyun Li on the suicide of both her sons [2025-05-17T08:01:00Z]
Twenty years later: how 2005 Ashes marked end of cricket as we knew it [2025-05-17T07:00:59Z]
‘Proving people wrong’: how Central Coast Mariners reached A-League Women grand final [2025-05-15T10:29:32Z]

# Dataset B: articles that mention Queensland, Brisbane, or the quoted
# phrases "Gold Coast" / "Surfers Paradise".
search_string = "Queensland OR Brisbane OR \"Gold Coast\" OR \"Surfers Paradise\""
# NOTE(review): confirm that the API accepts this syntax for combining
# production offices.
production_office = "uk & us"
from_date = str(seven_days_ago)
to_date=str(today)

############## Uncomment on weekly report ##########################
# QorGC_articles=fetch_data(search_string,production_office,from_date,to_date)
############## Uncomment on weekly report ##########################

# NOTE(review): the whole search string, double quotes included, becomes part
# of the file name; some file systems (e.g. on Windows) do not allow '"'.
file_name = f"{search_string}_articles.json"

with open(f"{file_path_store_data}{file_name}",'r', encoding='utf-8') as fp:
    QorGC_articles = json.load(fp)

# Show at most five random titles as a quick look at the loaded data.
print(f"=====Random article titles from {len(QorGC_articles)} articles=====")
print_random_number_titles(QorGC_articles,min(5, len(QorGC_articles)))

=====Random article titles from 4 articles=====
Online dating advice: five ways to stay safe, according to the experts [2025-05-14T14:04:00Z]
St Helens find hope and a new hero in seven-try rout of Catalans Dragons [2025-05-15T21:02:03Z]
Postecoglou adamant work at Spurs is not done but sounds resigned to his fate [2025-05-20T18:52:53Z]
Richard Goodman obituary [2025-05-18T15:32:27Z]

# Dataset C: articles that combine a beach/coast word with a travel word.
search_string = "(beach OR beaches OR coast OR coasts) AND (travels OR travel OR tour OR tourist OR tourism OR trip OR holiday OR vacation)"
# Unlike datasets A and B, this filter value also names the "aus" office.
# NOTE(review): confirm that the API accepts this syntax for combining
# production offices.
production_office = "uk & us & aus"
from_date = str(seven_days_ago)
to_date=str(today)

############## Uncomment on weekly report ##########################
# fetch_data(search_string,production_office,from_date,to_date)
############## Uncomment on weekly report ##########################

# The file name is derived from the full search string, so it must match the
# name used when the data was saved.
file_name = f"{search_string}_articles.json"

with open(f"{file_path_store_data}{file_name}",'r', encoding='utf-8') as fp:
    beach_articles = json.load(fp)

# Show at most five random titles as a quick look at the loaded data.
print(f"=====Random article titles from {len(beach_articles)} articles=====")
print_random_number_titles(beach_articles,min(5, len(beach_articles)))

=====Random article titles from 8 articles=====
What Donald Trump did this week should terrify Benjamin Netanyahu. This is why | Jonathan Freedland [2025-05-16T15:44:57Z]
The Guardian’s happiest places to live in Britain revealed [2025-05-17T11:00:05Z]
Share a tip on a great dog-friendly holiday [2025-05-19T14:51:03Z]
‘Time slows down in Lastovo’: I may just have found Croatia’s most unspoilt archipelago [2025-05-14T06:00:01Z]
Thinking of a trip to Barcelona this summer? Beware – here’s what you'll find | Stephen Burgen [2025-05-20T04:00:47Z]

def remove_liveblog(dataset):
    """Delete every article whose "type" is "liveblog" from `dataset` in place.

    Prints the title of each deleted article, then a summary count.
    """
    removed = 0
    # Iterate over a snapshot so entries can be deleted while looping.
    for title, record in list(dataset.items()):
        if record["type"] != "liveblog":
            continue
        del dataset[title]
        removed += 1
        print(title)
    print(f"{removed} liveblog(s) removed")


def remove_aggregator(dataset, markers=("Morning Mail:", "Afternoon Update:", "briefing:")):
    """Delete round-up ("aggregator") articles from `dataset` in place.

    An article is removed when its title contains any of `markers`
    (case-sensitive substring match). Prints the title of each deleted
    article, then a summary count.

    Args:
        dataset: Dict of articles keyed by title; modified in place.
        markers: Collection of title fragments that identify aggregator
            articles. The default reproduces the original hard-coded list;
            pass a different collection to cover other round-up series.
    """
    remove_count = 0
    # Iterate over a snapshot of the keys so entries can be deleted safely.
    for title in list(dataset):
        if any(marker in title for marker in markers):
            dataset.pop(title)
            remove_count += 1
            print(title)
    print(f"{remove_count} aggregator(s) removed")

def remove_media(dataset, sections=("Media", "GNM press office")):
    """Delete articles published in media-related sections, in place.

    Prints the title of each deleted article, then a summary count.

    Args:
        dataset: Dict of articles keyed by title; each value needs a
            "section" entry. Modified in place.
        sections: Collection (tuple, list or set) of section names to drop.
            The default reproduces the original hard-coded pair.
    """
    remove_count = 0
    # Iterate over a snapshot so entries can be deleted while looping.
    for title, article in list(dataset.items()):
        if article["section"] in sections:
            dataset.pop(title)
            remove_count += 1
            print(title)
    print(f"{remove_count} media article(s) removed")

# Apply the same three clean-up steps to every dataset. Each helper deletes
# entries from the dict it is given, so the three *_articles dicts themselves
# are modified.
list_of_datasets=[australia_articles,QorGC_articles,beach_articles]

for dataset in list_of_datasets:
    remove_liveblog(dataset)
    remove_aggregator(dataset)
    remove_media(dataset)
    # Blank line to separate the printed report of one dataset from the next.
    print()

Eurovision song contest 2025 – as it happened [2025-05-17T23:38:40Z]
Poland’s presidential candidates seek to broaden appeal on campaign trail after nail-biting first round vote – as it happened [2025-05-19T12:22:10Z]
UK, France and Canada threaten action if Israel’s offensive continues as first aid crosses into Gaza in weeks – as it happened [2025-05-19T19:08:31Z]
3 liveblog(s) removed
Friday briefing: The deepening turmoil over the assisted dying bill  [2025-05-16T05:38:13Z]
1 aggregator(s) removed
0 media article(s) removed

0 liveblog(s) removed
0 aggregator(s) removed
0 media article(s) removed

0 liveblog(s) removed
0 aggregator(s) removed
0 media article(s) removed

def create_dataframe(dataset):
    """Build a DataFrame of keywords and section for each article.

    The frame is indexed by the keys of `dataset` (the article titles) and
    has two columns: "keywords" and "section".
    """
    records = list(dataset.values())
    columns = {
        "keywords": [record["keywords"] for record in records],
        "section": [record["section"] for record in records],
    }
    return pd.DataFrame(columns, index=dataset.keys())

# One keywords/section dataframe per cleaned dataset, indexed by title.
australia_terms_df=create_dataframe(australia_articles)
QorGC_terms_df=create_dataframe(QorGC_articles)
beach_terms_df=create_dataframe(beach_articles)

# Bar chart of how many Australia articles fall into each editorial section;
# value_counts() supplies the section names (index) and their counts (values).
fig = px.bar(
    x=australia_terms_df["section"].value_counts().index,
    y=australia_terms_df["section"].value_counts().values,
    labels={"x": "Editor Section", "y": "Number of articles"},
    title="Number of Articles by Editor Section related to Australia"
)

# Tilt the section names so that long labels do not overlap.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# Define own stop words
custom_stop_words = {'guardian', 'amp', 'nbsp','said','says','say','news','ve','think','like','did','didn','don','does','doesn','do'}

# Combine scikit-learn's English stop-word list with the custom words above.
extended_stop_words = list(ENGLISH_STOP_WORDS.union(custom_stop_words))

# max_df / min_df bound how many documents a term may appear in (see the
# TfidfVectorizer documentation for the exact float vs. int semantics).
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, max_features=100000, stop_words=extended_stop_words)
# Number of NMF topics to extract for the Australia dataset.
num_topics = 12

# Plain body text of every Australia article, in dict order.
australia_articles_text=[article["text"] for article in australia_articles.values() ]


# Document-term matrix and the vocabulary terms for its columns.
tfidf_dt_matrix=tfidf_vectorizer.fit_transform(australia_articles_text)
feature_names = tfidf_vectorizer.get_feature_names_out()
# NOTE(review): this pairs vocabulary terms with the items obtained by
# iterating the matrix (presumably its rows, i.e. documents); the result is
# not used anywhere in the visible code — confirm whether it is needed.
doc001_term_counts = list(zip(feature_names,tfidf_dt_matrix))

# Factorise the TF-IDF matrix into `num_topics` topics. The fixed
# random_state makes the random initialisation, and with it the topic
# numbering, repeatable between runs on the same data; without it the topic
# ids referred to further down change on every run.
nmf_model = NMF(n_components=num_topics,init='random',beta_loss='frobenius',random_state=42)

# One row per document with its weight for each topic.
doc_topic_nmf = nmf_model.fit_transform(tfidf_dt_matrix)

# One row per topic with its weight for each vocabulary term.
topic_term_nmf = nmf_model.components_
nmf_topic_dict = {}
for index, topic in enumerate(topic_term_nmf):
    # Ten highest-weighted terms of this topic, weights rounded to 4 decimals.
    ranked_terms = sorted(zip(feature_names, topic), key=lambda t: t[1], reverse=True)[:10]
    nmf_topic_dict[f"topic_{index}"] = {term: round(weight, 4) for term, weight in ranked_terms}

# For every document keep the id of its highest-weighted topic and that
# topic's top-term dict.
topic_num_list = []
topic_term = []

for idx, topic in enumerate(doc_topic_nmf):
    # argmax picks the position of the largest weight in this document's row.
    topic_num = topic.argmax()
    topic_num_list.append(topic_num)
    top_topic = nmf_topic_dict[f"topic_{topic_num}"]
    topic_term.append(top_topic)
# Assigned positionally, so this relies on the dataframe rows being in the
# same order as the texts that were vectorised.
australia_terms_df['nmfTopic'] = topic_num_list
australia_terms_df['nmf'] = topic_term

# Number of articles assigned to each topic.
terms_group = australia_terms_df.groupby('nmfTopic').size().reset_index(name='Article Count')

terms_group.rename(columns={'nmfTopic': 'Topic Number'}, inplace=True)

fig1 = px.bar(terms_group,
             x='Topic Number',
             y='Article Count',
             title='Number of Articles per NMF Topic related to Australia',
             labels={'Topic Number': 'Topic Number', 'Article Count': 'Article Count'},
             text='Article Count')

fig1.update_layout(xaxis=dict(type='category'))  # ensures discrete topic numbers
fig1.show()

# Grid of horizontal bar charts, one subplot per topic, three per row.
topics = list(nmf_topic_dict.items())
n_topics = len(topics)
n_cols = 3
n_rows = (n_topics + n_cols - 1) // n_cols  # Ceiling division

# Find the global max value for consistent x-axis range
max_val = max(max(terms.values()) for _, terms in topics)

# Subplot titles turn "topic_3" into "Topic 3".
fig2 = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[f"{topic.replace('_', ' ').title()}" for topic, _ in topics]
)

for idx, (topic, terms) in enumerate(topics):
    # Convert the running index into 1-based grid coordinates.
    row = idx // n_cols + 1
    col = idx % n_cols + 1

    # Terms on the y axis, their topic weights as bar lengths and labels.
    fig2.add_trace(
        go.Bar(
            x=list(terms.values()),
            y=list(terms.keys()),
            orientation='h',
            text=list(terms.values()),
            name=topic,
            showlegend=False
        ),
        row=row, col=col
    )
    # Reverse the y axis so the first (highest-weighted) term is drawn on top.
    fig2.update_yaxes(autorange='reversed', row=row, col=col)
    fig2.update_xaxes(range=[0, max_val], row=row, col=col)  # Fix x-axis range

# The figure height grows with the number of subplot rows.
fig2.update_layout(
    height=350 * n_rows,
    width=1200,
    title_text="Top 10 Terms per Topic Related to Australia",
    showlegend=False
)

fig2.show()

# Cross-reference the NMF topic of each article with its editorial section:
# a histogram with the "count" histfunc, split into one facet column per
# topic id. category_orders lists the ids 0..num_topics-1 to fix their order.
fig3 = px.histogram(australia_terms_df, 
                    x="nmfTopic", 
                    y="section", 
                    facet_col="nmfTopic", 
                    histfunc="count",  
                    labels={
                        "nmfTopic": "Topic",
                        "section": "Section",
                        "count": "Topic Count"
                    },
                    category_orders={"nmfTopic": range(num_topics)}) 

# Overall title and axis titles; the legend is hidden.
fig3.update_layout(
    title="Sections Distribution Across Topics related to Australia", 
    xaxis_title="count",
    yaxis_title="Section",
    showlegend=False 
)

fig3.show()

# Pick one random article per topic for a qualitative check. Only topics that
# have at least one assigned article form a group.
grouped_by_topic = australia_terms_df.groupby('nmfTopic')

# One single-row dataframe per group ...
samples1 = [group.sample(n=1) for _, group in grouped_by_topic]

# ... reduced to that row as a Series; its .name is the article title.
samples1 = [sample.iloc[0] for sample in samples1]

for doc in samples1:
    print(f"NMF Topic {doc['nmfTopic']}: ")
    print(f"\t>> Title:\t {doc.name}")
    print("\t>> Section:\t", doc['section'])
    print("\t>> Keywords:\t", doc['keywords'])
    # Only the first five of the topic's ten stored top terms are shown.
    print("\t>> NMF terms:\t", f"Topic {doc['nmfTopic']}: ", list(doc['nmf'].keys())[:5])
    print()

NMF Topic 0: 
	>> Title:	 The Spin | Gunnersbury women’s cricket club celebrate hitting historic century [2025-05-14T09:20:59Z]
	>> Section:	 Sport
	>> Keywords:	 ['sport', 'cricket']
	>> NMF terms:	 Topic 0:  ['women', 'league', 'victory', 'rugby', 'season']

NMF Topic 1: 
	>> Title:	 UK urged not to exploit poor countries in rush for critical minerals [2025-05-14T23:01:09Z]
	>> Section:	 Business
	>> Keywords:	 ['green-politics', 'mining', 'environment', 'business', 'uk', 'commodities']
	>> NMF terms:	 Topic 1:  ['russia', 'ukraine', 'australian', 'rights', 'justice']

NMF Topic 2: 
	>> Title:	 ‘Extreme anxiety and extreme depression’: Jennifer Lawrence says she felt ‘like an alien’ as a new mother [2025-05-18T11:39:54Z]
	>> Section:	 Film
	>> Keywords:	 ['lifeandstyle', 'postnatal-depression', 'film', 'mental-health', 'lynne-ramsay', 'festivals', 'parents-and-parenting', 'culture', 'robert-pattinson', 'society', 'cannesfilmfestival']
	>> NMF terms:	 Topic 2:  ['film', 'trump', 'cannes', 'movie', 'anderson']

NMF Topic 3: 
	>> Title:	 Here comes summer: reasons to love riesling [2025-05-15T12:00:11Z]
	>> Section:	 Food
	>> Keywords:	 ['food', 'australian-food-and-drink', 'wine', 'german-food-and-drink']
	>> NMF terms:	 Topic 3:  ['wine', 'richard', 'zealand', 'dry', 'sure']

NMF Topic 4: 
	>> Title:	 EU reset deal puts Britain back on the world stage, says Keir Starmer [2025-05-19T18:31:04Z]
	>> Section:	 Politics
	>> Keywords:	 ['keir-starmer', 'europe-news', 'ursula-von-der-leyen', 'foreignpolicy', 'politics', 'uk', 'eu', 'eu-referendum', 'world']
	>> NMF terms:	 Topic 4:  ['eu', 'deal', 'uk', 'starmer', 'british']

NMF Topic 5: 
	>> Title:	 Needless controversy over foreign-born Lions players ramps up pressure [2025-05-19T17:00:33Z]
	>> Section:	 Sport
	>> Keywords:	 ['british-irish-lions', 'sport', 'rugby-union']
	>> NMF terms:	 Topic 5:  ['lions', 'players', 'jones', 'ireland', 'farrell']

NMF Topic 6: 
	>> Title:	 Young British woman held on drug charges in Sri Lanka could be linked to Culley case [2025-05-19T17:46:09Z]
	>> Section:	 World news
	>> Keywords:	 ['srilanka', 'thailand', 'drugs', 'south-and-central-asia', 'uk', 'society', 'cannabis', 'asia-pacific', 'world']
	>> NMF terms:	 Topic 6:  ['phone', 'children', 'writing', 'life', 'people']

NMF Topic 7: 
	>> Title:	 Gods arrive from India, myths grow Tinguely and meat gets sensual – the week in art [2025-05-16T11:00:53Z]
	>> Section:	 Art and design
	>> Keywords:	 ['artanddesign', 'painting', 'exhibition', 'photography', 'culture', 'art']
	>> NMF terms:	 Topic 7:  ['art', 'single', 'stakes', 'era', 'pop']

NMF Topic 8: 
	>> Title:	 England expect most players will choose country over IPL for West Indies ODIs [2025-05-13T17:46:13Z]
	>> Section:	 Sport
	>> Keywords:	 ['england-cricket-team', 'sport', 'cricket']
	>> NMF terms:	 Topic 8:  ['cricket', 'kohli', 'test', 'england', 'zimbabwe']

NMF Topic 9: 
	>> Title:	 UK ‘the sick person of the wealthy world’ amid increase in deaths from drugs and violence [2025-05-19T23:01:41Z]
	>> Section:	 Society
	>> Keywords:	 ['heart-disease', 'drugs', 'uk', 'society', 'cancer', 'health']
	>> NMF terms:	 Topic 9:  ['health', 'mental', 'uk', 'people', 'deaths']

NMF Topic 10: 
	>> Title:	 Royal College of Psychiatrists says it cannot yet support assisted dying bill [2025-05-14T13:08:45Z]
	>> Section:	 Society
	>> Keywords:	 ['psychiatry', 'assisted-suicide', 'wales', 'politics', 'uk', 'law', 'uk-news', 'society', 'england', 'houseofcommons', 'health']
	>> NMF terms:	 Topic 10:  ['assisted', 'dying', 'mps', 'psychiatrists', 'college']

NMF Topic 11: 
	>> Title:	 Austrians celebrate JJ bringing home first Eurovision win in 11 years [2025-05-18T16:35:46Z]
	>> Section:	 Television & radio
	>> Keywords:	 ['austria', 'europe-news', 'eurovision', 'tv-and-radio', 'culture', 'eurovision-2025', 'music', 'world']
	>> NMF terms:	 Topic 11:  ['eurovision', 'song', 'contest', 'jj', 'entry']

# Body text of the sampled article at position 9 of samples1, looked up by
# its title.
# NOTE(review): samples1 has one entry per topic that received at least one
# article, so index 9 exists only when ten or more topics are populated, and
# the article it refers to changes whenever the samples are redrawn — confirm.
text = australia_articles[samples1[9].name]["text"]

# Words to mark in the rendered article text.
words_to_highlight = ['Australia']

def highlight_words_in_html(text, words):
    """Wrap every occurrence of the given words in a yellow, bold <span>.

    Matching is case-insensitive and literal (regex metacharacters in a word
    have no special meaning). The matched text keeps its original casing.
    All words are matched in a single pass over the original text, so the
    inserted markup itself is never searched.

    Args:
        text: The text to decorate.
        words: Iterable of strings to highlight; empty strings are ignored.

    Returns:
        An HTML string holding the decorated text inside a <p> element.
    """
    # Longest words first so a word wins over any shorter word it starts with;
    # the secondary alphabetical key only keeps the pattern deterministic.
    literals = sorted({word for word in words if word}, key=lambda w: (-len(w), w))
    if literals:
        pattern = re.compile("|".join(re.escape(word) for word in literals), flags=re.IGNORECASE)
        # A function replacement re-inserts exactly what was matched and is
        # not subject to backslash processing.
        text = pattern.sub(
            lambda match: f'<span style="background-color: yellow; font-weight: bold;">{match.group(0)}</span>',
            text,
        )

    html_content = f"""
            
            <p>{text}</p>

    """
    
    return html_content

# Build the HTML for the selected article with the chosen words highlighted.
html_content = highlight_words_in_html(text, words_to_highlight)

# NOTE(review): `HTML` is not imported anywhere in the visible source —
# presumably IPython.display.HTML; confirm and add the import.
HTML(html_content)

def topics_by_tfidf(dataset,term_dataframe,max_df,min_df,max_features):
    """Add a 'tfidf' column holding each article's five top-scoring terms.

    The article texts of `dataset` are vectorised, and for every row of
    `term_dataframe` the five terms with the highest score in the matching
    article are stored as a list. `term_dataframe` is modified in place and
    nothing is returned.

    NOTE(review): despite the name, the scores are raw term counts from
    CountVectorizer, not TF-IDF weights — confirm which was intended.

    Args:
        dataset: Dict of articles keyed by title; each value needs "text".
        term_dataframe: DataFrame indexed by the same titles.
        max_df, min_df, max_features: Passed straight to the vectorizer.
    """
    vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words=extended_stop_words)
    texts = [article["text"] for article in dataset.values()]
    doc_term_matrix = vectorizer.fit_transform(texts)

    # Dense documents-by-terms table, converted once, for label-based lookup.
    scores_df = pd.DataFrame(
        doc_term_matrix.toarray(),
        index=dataset.keys(),
        columns=vectorizer.get_feature_names_out(),
    )

    # Create the column first so that .at can store a list in each cell.
    term_dataframe['tfidf'] = None
    for idx in term_dataframe.index:
        top_terms = scores_df.loc[idx].sort_values(ascending=False).head(5)
        term_dataframe.at[idx,'tfidf'] = list(top_terms.index)
    
# Dataset B: add the top-term column with max_df=0.5, min_df=1 and at most
# 200 vocabulary terms.
topics_by_tfidf(QorGC_articles,QorGC_terms_df,0.5,1,200)
QorGC_terms_df

# Up to five distinct random row positions of the dataframe.
samples = random.sample(range(0, len(QorGC_terms_df)), min(5, len(QorGC_terms_df)))

for sample in samples:
    # doc.name is the row label, i.e. the article title.
    doc = QorGC_terms_df.iloc[sample]
    print(f"[{sample}] {doc.name}")
    print("\t>> Section:\t",doc['section'])
    print("\t>> Keywords:\t",doc['keywords'])
    # print("\t>> NMF terms:\t",f"Topic {doc['nmfTopic']}: ", list(doc['nmf'].keys())[:5])
    print("\t>> TFIDF:\t",doc['tfidf'])
    print()

[2] Postecoglou adamant work at Spurs is not done but sounds resigned to his fate [2025-05-20T18:52:53Z]
	>> Section:	 Football
	>> Keywords:	 ['uefa-europa-league', 'ange-postecoglou', 'europeanfootball', 'australia-sport', 'football', 'sport', 'tottenham-hotspur']
	>> TFIDF:	 ['postecoglou', 'future', 'spurs', 'final', 'll']

[0] Richard Goodman obituary [2025-05-18T15:32:27Z]
	>> Section:	 Food
	>> Keywords:	 ['london', 'uk', 'newzealand', 'wine', 'world', 'food', 'australia-news']
	>> TFIDF:	 ['wine', 'richard', 'new', 'zealand', 'joan']

[3] Online dating advice: five ways to stay safe, according to the experts [2025-05-14T14:04:00Z]
	>> Section:	 The Filter
	>> Keywords:	 ['dating', 'technology', 'internet-safety', 'lifeandstyle', 'apps', 'tinder', 'relationships']
	>> TFIDF:	 ['dating', 'app', 'apps', 'online', 'red']

[1] St Helens find hope and a new hero in seven-try rout of Catalans Dragons [2025-05-15T21:02:03Z]
	>> Section:	 Sport
	>> Keywords:	 ['rugbyleague', 'catalans', 'superleague', 'sport', 'sthelens']
	>> TFIDF:	 ['whitby', 'wellens', 'saints', 'year', 'pressure']

# Bar chart of how many beach articles fall into each editorial section;
# value_counts() supplies the section names (index) and their counts (values).
fig4 = px.bar(
    x=beach_terms_df["section"].value_counts().index,
    y=beach_terms_df["section"].value_counts().values,
    labels={"x": "Section", "y": "Number"},
    title="Dataset C Beach: Number of Articles by Section"
)

# Tilt the section names so that long labels do not overlap.
fig4.update_layout(xaxis_tickangle=-45)
fig4.show()

# Same vectorizer set-up as for the Australia dataset except min_df=1, and
# fewer topics. These assignments rebind the module-level names used there
# (tfidf_vectorizer, num_topics, tfidf_dt_matrix, feature_names).
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=1, max_features=100000, stop_words=extended_stop_words)
num_topics = 5

# Plain body text of every beach article, in dict order.
beach_articles_text=[article["text"] for article in beach_articles.values() ]

# Document-term matrix and the vocabulary terms for its columns.
tfidf_dt_matrix=tfidf_vectorizer.fit_transform(beach_articles_text)
feature_names = tfidf_vectorizer.get_feature_names_out()
# NOTE(review): this pairs vocabulary terms with the items obtained by
# iterating the matrix (presumably its rows, i.e. documents); the result is
# not used anywhere in the visible code — confirm whether it is needed.
doc001_term_counts = list(zip(feature_names,tfidf_dt_matrix))

# Factorise the TF-IDF matrix into `num_topics` topics. The fixed
# random_state makes the random initialisation, and with it the topic
# numbering, repeatable between runs on the same data.
nmf_model = NMF(n_components=num_topics,init='random',beta_loss='frobenius',random_state=42)

# One row per document with its weight for each topic.
doc_topic_nmf = nmf_model.fit_transform(tfidf_dt_matrix)

# One row per topic with its weight for each vocabulary term.
topic_term_nmf = nmf_model.components_
nmf_topic_dict = {}
for index, topic in enumerate(topic_term_nmf):
    # Ten highest-weighted terms of this topic, weights rounded to 4 decimals.
    ranked_terms = sorted(zip(feature_names, topic), key=lambda t: t[1], reverse=True)[:10]
    nmf_topic_dict[f"topic_{index}"] = {term: round(weight, 4) for term, weight in ranked_terms}

# For every document keep the id of its highest-weighted topic and that
# topic's top-term dict.
topic_num_list = []
topic_term = []

for idx, topic in enumerate(doc_topic_nmf):
    # argmax picks the position of the largest weight in this document's row.
    topic_num = topic.argmax()
    topic_num_list.append(topic_num)
    top_topic = nmf_topic_dict[f"topic_{topic_num}"]
    topic_term.append(top_topic)
# Assigned positionally, so this relies on the dataframe rows being in the
# same order as the texts that were vectorised.
beach_terms_df['nmfTopic'] = topic_num_list
beach_terms_df['nmf'] = topic_term

# Number of articles assigned to each topic.
terms_group = beach_terms_df.groupby('nmfTopic').size().reset_index(name='Article Count')

terms_group.rename(columns={'nmfTopic': 'Topic Number'}, inplace=True)

fig5 = px.bar(terms_group,
             x='Topic Number',
             y='Article Count',
             title='Number of Articles per NMF Topic',
             labels={'Topic Number': 'Topic Number', 'Article Count': 'Article Count'},
             text='Article Count')

fig5.update_layout(xaxis=dict(type='category'))  # ensures discrete topic numbers
fig5.show()


# Grid of horizontal bar charts, one subplot per topic, three per row.
topics = list(nmf_topic_dict.items())
n_topics = len(topics)
n_cols = 3
n_rows = (n_topics + n_cols - 1) // n_cols  # Ceiling division

# Largest term weight over all topics, used as the shared x-axis limit.
max_val = max(max(terms.values()) for _, terms in topics)

# Subplot titles turn "topic_3" into "Top 10 Terms for Topic 3".
fig6 = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[f"Top 10 Terms for {topic.replace('_', ' ').title()}" for topic, _ in topics]
)

for idx, (topic, terms) in enumerate(topics):
    # Convert the running index into 1-based grid coordinates.
    row = idx // n_cols + 1
    col = idx % n_cols + 1

    # Terms on the y axis, their topic weights as bar lengths and labels.
    fig6.add_trace(
        go.Bar(
            x=list(terms.values()),
            y=list(terms.keys()),
            orientation='h',
            text=list(terms.values()),
            name=topic,
            showlegend=False
        ),
        row=row, col=col
    )
    # Reverse the y axis so the first (highest-weighted) term is drawn on top.
    fig6.update_yaxes(autorange='reversed', row=row, col=col)
    # Same x range in every subplot so bar lengths are comparable.
    fig6.update_xaxes(range=[0, max_val], row=row, col=col)

# The figure height grows with the number of subplot rows.
fig6.update_layout(
    height=350 * n_rows,
    width=1200,
    title_text="Top 10 Terms per topic related to beaches",
    showlegend=False
)

fig6.show()

# Cross-reference the NMF topic of each article with its editorial section:
# a histogram with the "count" histfunc, split into one facet column per
# topic id. category_orders lists the ids 0..num_topics-1 to fix their order.
fig7 = px.histogram(beach_terms_df, 
                    x="nmfTopic", 
                    y="section", 
                    facet_col="nmfTopic", 
                    histfunc="count",  
                    labels={
                        "nmfTopic": "Topic",
                        "section": "Section",
                        "count": "Topic Count"
                    },
                    category_orders={"nmfTopic": range(num_topics)})  

# Overall title and axis titles; the legend is hidden.
fig7.update_layout(
    title="Section Distribution across Topics related to beaches",  
    xaxis_title="count",
    yaxis_title="Section",
    showlegend=False 
)

fig7.show()

# Pick one random article per topic for a qualitative check. Only topics that
# have at least one assigned article form a group.
grouped_by_topic = beach_terms_df.groupby('nmfTopic')

# One single-row dataframe per group ...
samples = [group.sample(n=1) for _, group in grouped_by_topic]

# ... reduced to that row as a Series; its .name is the article title.
samples = [sample.iloc[0] for sample in samples]

for doc in samples:
    print(f"NMF Topic {doc['nmfTopic']}: ")
    print(f"\t>> Title:\t {doc.name}")
    print("\t>> Section:\t", doc['section'])
    print("\t>> Keywords:\t", doc['keywords'])
    # Only the first five of the topic's ten stored top terms are shown.
    print("\t>> NMF terms:\t", f"Topic {doc['nmfTopic']}: ", list(doc['nmf'].keys())[:5])
    print()

NMF Topic 0: 
	>> Title:	 The Guardian’s happiest places to live in Britain revealed [2025-05-17T11:00:05Z]
	>> Section:	 Life and style
	>> Keywords:	 ['wales', 'lifeandstyle', 'uk', 'happiness', 'property', 'britishidentity', 'scotland', 'money', 'communities', 'cities', 'housing', 'society']
	>> NMF terms:	 Topic 0:  ['town', 'muxima', 'kiel', 'boys', '000']

NMF Topic 1: 
	>> Title:	 First Thing: Trump agrees deal for UAE to build largest AI campus outside US [2025-05-16T12:30:06Z]
	>> Section:	 US news
	>> Keywords:	 ['us-news']
	>> NMF terms:	 Topic 1:  ['trump', 'israel', 'israeli', 'gaza', 'netanyahu']

NMF Topic 2: 
	>> Title:	 Share a tip on a great dog-friendly holiday [2025-05-19T14:51:03Z]
	>> Section:	 Travel
	>> Keywords:	 ['travel']
	>> NMF terms:	 Topic 2:  ['competition', 'tips', 'terms', 'words', 'uk']

NMF Topic 3: 
	>> Title:	 Thinking of a trip to Barcelona this summer? Beware – here’s what you'll find | Stephen Burgen [2025-05-20T04:00:47Z]
	>> Section:	 Opinion
	>> Keywords:	 ['travel', 'barcelona', 'overtourism', 'spain', 'news', 'world', 'europe-news']
	>> NMF terms:	 Topic 3:  ['barcelona', 'tourism', 'tourists', 'city', 'spain']

NMF Topic 4: 
	>> Title:	 ‘Time slows down in Lastovo’: I may just have found Croatia’s most unspoilt archipelago [2025-05-14T06:00:01Z]
	>> Section:	 Travel
	>> Keywords:	 ['travel', 'europe', 'environment', 'birds', 'croatia', 'wildlife']
	>> NMF terms:	 Topic 4:  ['lastovo', 'zaklopatica', 'bay', 'night', 'coast']

SCENARIO¶

Scenario details¶

A. Question¶

B. Data¶

Using news articles to get the trends¶

Ethical considerations in data source¶

Use The Guardian as the source of data¶

Ethical considerations in news source¶

How to use the Guardian API¶

Consideration when applying "Production Office" filter¶

Searching data to answer the questions¶

Import all required libraries¶

Define constants¶

Fetch Dataset A Views about Australia¶

Fetch Dataset B Views about Queensland and Gold Coast¶

Fetch Dataset C Discussion about beaches¶

C. Analysis and Visualization¶

Data cleaning¶

Ethical considerations¶

Create terms dataframe for each dataset¶

Perform topic modelling¶

Dataset A: Views About Australia¶

Non-negative matrix factorization topic modelling¶

Customize stop words¶

Visualizing the NMF Topic modelling Results¶

Cross reference the NMF results with section assigned by editor¶

Limitations¶

Qualitative analysis to the topic¶

Ethical considerations¶

Summary: Discussions on Australia in International Context¶

Dataset B Views about Queensland and Gold Coast¶

Dataset C Discussion about beaches¶

D. Insight¶

What are the recent discussions on the destination that might draw customers or potential customers attention?¶

What are the current travel preferences among beach-goers that can help design travel packages or adjust the tours?¶

Limitations of insights¶

Ethical considerations¶

Reference¶

	keywords	section	tfidf
Richard Goodman obituary [2025-05-18T15:32:27Z]	[london, uk, newzealand, wine, world, food, au...	Food	[wine, richard, new, zealand, joan]
St Helens find hope and a new hero in seven-try rout of Catalans Dragons [2025-05-15T21:02:03Z]	[rugbyleague, catalans, superleague, sport, st...	Sport	[whitby, wellens, saints, year, pressure]
Postecoglou adamant work at Spurs is not done but sounds resigned to his fate [2025-05-20T18:52:53Z]	[uefa-europa-league, ange-postecoglou, europea...	Football	[postecoglou, future, spurs, final, ll]
Online dating advice: five ways to stay safe, according to the experts [2025-05-14T14:04:00Z]	[dating, technology, internet-safety, lifeands...	The Filter	[dating, app, apps, online, red]

SCENARIO¶

Scenario details¶

A. Question¶

B. Data¶

Using news articles to get the trends¶

Ethical considerations in data source¶

Use The Guardian as the source of data¶

Ethical considerations in news source¶

How to use the Guardian API¶

Consideration when applying "Production Office" filter¶

Searching data to answer the questions¶

Import all required libraries¶

Define constants¶

Fetch articles and related information from The Guardian API¶

Fetch Dataset A Views about Australia¶

Fetch Dataset B Views about Queensland and Gold Coast¶

Fetch Dataset C Discussion about beaches¶

C. Analysis and Visualization¶

Data cleaning¶

Ethical considerations¶

Create terms dataframe for each dataset¶

Perform topic modelling¶

Dataset A: Views About Australia¶

Non-negative matrix factorization topic modelling¶

Customize stop words¶

Visualizing the NMF Topic modelling Results¶

Cross reference the NMF results with section assigned by editor¶

Interpretation of Topic Modeling Results Related to Australia¶

Limitations¶

Qualitative analysis to the topic¶

Ethical considerations¶

Summary: Discussions on Australia in International Context¶

Dataset B Views about Queensland and Gold Coast¶

Dataset C Discussion about beaches¶

D. Insight¶

What are the recent discussions on the destination that might draw customers or potential customers attention?¶

What are the current travel preferences among beach-goers that can help design travel packages or adjust the tours?¶

Limitations of insights¶

Ethical considerations¶

Reference¶