# Source: argus-nexus/backend/services/news.py
# Snapshot: 2026-03-09 22:07:19 +01:00 (269 lines, 11 KiB, Python)

import asyncio
import html
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from xml.etree import ElementTree as ET

import httpx
# On-disk cache: ".cache/news.json" inside the parent of this module's directory.
CACHE_FILE = Path(__file__).parent.parent.joinpath(".cache", "news.json")
# Maximum age of the cache file, in seconds, before it counts as expired.
CACHE_DURATION_SEC = 5 * 60
def _get_cached_news():
    """Return the cached payload if the cache file is still fresh.

    The cache counts as fresh while the file's modification time is less than
    ``CACHE_DURATION_SEC`` seconds in the past.

    Returns:
        The decoded JSON content of ``CACHE_FILE``, or ``None`` when the file
        is missing, expired, unreadable, or not valid JSON.
    """
    try:
        # stat() directly instead of exists() + stat(): a file that vanishes
        # between the two calls must not raise out of this best-effort lookup.
        age_sec = datetime.now().timestamp() - CACHE_FILE.stat().st_mtime
    except OSError:
        return None
    if age_sec >= CACHE_DURATION_SEC:
        return None
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        return None
def _save_cache(data):
    """Write *data* as JSON to ``CACHE_FILE`` on a best-effort basis.

    The payload is first written to a sibling ``.tmp`` file and then moved
    over ``CACHE_FILE``, so a failed or partial write never replaces the
    previous cache content. Failures are reported and otherwise ignored.

    Args:
        data: Any JSON-serialisable object.
    """
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    try:
        # Directory creation is part of the best-effort write, so it lives
        # inside the try block as well.
        CACHE_FILE.parent.mkdir(exist_ok=True, parents=True)
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f)
        tmp_file.replace(CACHE_FILE)
    except (OSError, TypeError, ValueError) as e:
        print(f"[NEWS] Cache write failed: {e}")
        try:
            tmp_file.unlink()
        except OSError:
            pass  # nothing to clean up
# RSS/Atom feed URLs polled on every cache refresh.
# All feeds are requested over HTTPS so responses cannot be read or altered
# in transit.
FEEDS = [
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://feeds.bbci.co.uk/news/world/rss.xml",
    "https://www.reutersagency.com/feed/?best-topics=political-general&post_type=best",
    "https://www.theguardian.com/world/rss",
    "https://feeds.npr.org/1004/rss.xml",
    "https://foreignpolicy.com/feed/",
    "https://www.cnbc.com/id/100727362/device/rss/rss.html",
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
    "https://feeds.washingtonpost.com/rss/world",
]
# Expanded keyword geocoder — covers most geopolitically relevant regions.
# Maps a place name to [latitude, longitude] in decimal degrees
# (south and west are negative).
GEO_DATA = {
    # Middle East
    "Iran": [32.4279, 53.6880], "Israel": [31.0461, 34.8516],
    "Gaza": [31.3547, 34.3088], "West Bank": [31.9466, 35.3027],
    "Lebanon": [33.8547, 35.8623], "Syria": [34.8021, 38.9968],
    "Yemen": [15.5527, 48.5164], "Iraq": [33.2232, 43.6793],
    "Saudi Arabia": [23.8859, 45.0792], "Jordan": [30.5852, 36.2384],
    "Kuwait": [29.3117, 47.4818], "Qatar": [25.3548, 51.1839],
    "UAE": [23.4241, 53.8478], "Bahrain": [26.0667, 50.5577],
    "Oman": [21.5126, 55.9233],
    # Europe
    "Ukraine": [48.3794, 31.1656], "Russia": [61.5240, 105.3188],
    "Germany": [51.1657, 10.4515], "France": [46.2276, 2.2137],
    "UK": [55.3781, -3.4360], "Poland": [51.9194, 19.1451],
    "Romania": [45.9432, 24.9668], "Finland": [61.9241, 25.7482],
    "Sweden": [60.1282, 18.6435], "Norway": [60.4720, 8.4689],
    "NATO": [50.8503, 4.3517], "Belarus": [53.7098, 27.9534],
    "Moldova": [47.4116, 28.3699], "Georgia": [42.3154, 43.3569],
    "Serbia": [44.0165, 20.9129], "Kosovo": [42.6026, 20.9030],
    # Asia-Pacific
    # "South China Sea" is listed before "China" so that a first-match lookup
    # in dictionary order prefers the more specific name.
    "South China Sea": [12.0000, 113.0000],
    "China": [35.8617, 104.1954], "Taiwan": [23.6978, 120.9605],
    "North Korea": [40.3399, 127.5101], "South Korea": [35.9078, 127.7669],
    "Japan": [36.2048, 138.2529], "India": [20.5937, 78.9629],
    "Pakistan": [30.3753, 69.3451], "Afghanistan": [33.9391, 67.7100],
    "Myanmar": [21.9162, 95.9560], "Philippines": [12.8797, 121.7740],
    "Vietnam": [14.0583, 108.2772],
    # Americas
    "USA": [37.0902, -95.7129], "Mexico": [23.6345, -102.5528],
    "Venezuela": [6.4238, -66.5897], "Colombia": [4.5709, -74.2973],
    "Cuba": [21.5218, -77.7812], "Nicaragua": [12.8654, -85.2072],
    # Brazil lies in the southern hemisphere: its latitude is negative.
    "Haiti": [18.9712, -72.2852], "Brazil": [-14.2350, -51.9253],
    "Argentina": [-38.4161, -63.6167], "Chile": [-35.6751, -71.5430],
    "Peru": [-9.1900, -75.0152], "Guyana": [4.8604, -58.9302],
    # Central Asia & Caucasus
    "Kazakhstan": [48.0196, 66.9237], "Azerbaijan": [40.1431, 47.5769],
    "Armenia": [40.0691, 45.0382], "Nagorno-Karabakh": [39.8177, 46.7528],
    "Uzbekistan": [41.3775, 64.5853], "Kyrgyzstan": [41.2044, 74.7661],
    # Specific Conflict Regions & Strategic Spots
    # (Gaza and the West Bank are already listed under Middle East.)
    "Donbas": [48.0159, 37.8028], "Kashmir": [34.0837, 74.7973],
    "Sudan": [12.8628, 30.2176], "Darfur": [13.4175, 24.3311],
    "Tigray": [14.0323, 38.3166], "Somalia": [5.1521, 46.1996],
    "Suez Canal": [29.9329, 32.5539], "Panama Canal": [9.1012, -79.6967],
    "Bering Strait": [66.0, -169.0], "Malacca": [2.5, 102.0],
    # Cities
    "New York": [40.7128, -74.0060], "London": [51.5074, -0.1278],
    "Paris": [48.8566, 2.3522], "Brussels": [50.8503, 4.3517],
    "Geneva": [46.2044, 6.1432], "Vienna": [48.2082, 16.3738],
    "Istanbul": [41.0082, 28.9784], "Kyiv": [50.4501, 30.5234],
    "Moscow": [55.7558, 37.6173], "Tehran": [35.6892, 51.3890],
    "Beijing": [39.9042, 116.4074], "Tokyo": [35.6762, 139.6503],
    "Seoul": [37.5665, 126.9780],
}
# Headlines containing any of these lowercase terms are treated as off-topic
# (sports, entertainment, lifestyle) and skipped.
EXCLUDE_KEYWORDS = [
    # Sports
    "sport", "football", "soccer", "la liga", "champions league", "cup",
    "match", "olympics", "tennis", "nfl", "nba", "score", "goal",
    "premier league", "formula 1", "f1", "golf", "cricket", "rugby",
    "boxing", "mma",
    # Entertainment
    "celebrity", "oscars", "grammy", "fashion", "movie", "film", "series",
    # Lifestyle
    "recipe", "weather forecast", "horoscope",
]
# Strip HTML tags from RSS descriptions
_TAG_RE = re.compile(r'<[^>]+>')


def _strip_html(s: str) -> str:
    """Return *s* as plain text: tags removed, HTML entities decoded, trimmed.

    Feed descriptions frequently carry entity-encoded text ("&amp;",
    "&#8217;", "&nbsp;"). Entities are decoded after the first tag pass, and
    tags are stripped once more afterwards so that entity-encoded markup
    ("&lt;b&gt;") cannot survive as literal tags in the result.

    Args:
        s: Raw description text; an empty or falsy value yields ''.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if not s:
        return ''
    without_tags = _TAG_RE.sub('', s)
    decoded = html.unescape(without_tags)
    return _TAG_RE.sub('', decoded).strip()
def _find_text(el: ET.Element, tag: str) -> str:
    """Find text for a tag, checking common RSS/Atom namespaces.

    The un-namespaced child is tried first, then the Dublin Core, RSS 1.0 and
    Atom namespaces, in that order. A child whose text is empty or only
    whitespace is skipped so a later candidate can still supply a value.

    Args:
        el: Element whose direct children are searched.
        tag: Local tag name, e.g. ``'title'``.

    Returns:
        The stripped text of the first non-empty match, or '' if none.
    """
    namespaces = (
        '',  # plain RSS 2.0 (no namespace)
        '{http://purl.org/dc/elements/1.1/}',
        '{http://purl.org/rss/1.0/}',
        '{http://www.w3.org/2005/Atom}',
    )
    for ns in namespaces:
        node = el.find(f'{ns}{tag}')
        if node is None or not node.text:
            continue
        text = node.text.strip()
        if text:
            return text
    return ''
# XML namespaces of Atom feeds and of the Media RSS extension.
_ATOM_NS = '{http://www.w3.org/2005/Atom}'
_MEDIA_NS = '{http://search.yahoo.com/mrss/}'

# File extensions accepted as images when a feed gives no image MIME type.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')

# Keywords up to this length are matched as whole words only. Plain substring
# matching made short tokens fire inside unrelated words: "nfl" in "conflict",
# "mma" in "commander", "cup" in "occupation", "iss" in "missile", "war" in
# "toward", "nato" in "senator".
_SHORT_KEYWORD_LEN = 4

# Ordered (category, keywords) rules; the first rule that matches wins.
# "warfare", "warship", "warplane" and "wartime" are listed explicitly because
# "war" itself is matched as a whole word.
_CATEGORY_KEYWORDS = (
    ("CYBER", ("cyber", "hacking", "breach", "malware", "botnet")),
    ("SPACE", ("satellite", "orbit", "rocket", "launch", "space", "iss")),
    ("MILITARY", ("military", "army", "navy", "airforce", "missile", "strike",
                  "war", "warfare", "warship", "warplane", "wartime",
                  "conflict", "nato", "defense")),
)


def _keyword_regex(keywords):
    """Compile *keywords* into one case-insensitive search pattern.

    Keywords of at most ``_SHORT_KEYWORD_LEN`` characters must match a whole
    word (an optional plural "s"/"es" is allowed). Longer keywords keep
    matching anywhere in the text, so inflected and compound forms
    ("launched", "airstrike") still hit.
    """
    alternatives = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        if len(keyword) <= _SHORT_KEYWORD_LEN:
            alternatives.append(rf'\b{escaped}(?:e?s)?\b')
        else:
            alternatives.append(escaped)
    if not alternatives:
        # An empty alternation would match every string; use a pattern that
        # never matches instead.
        return re.compile(r'(?!)')
    return re.compile('|'.join(alternatives), re.IGNORECASE)


_CATEGORY_PATTERNS = tuple(
    (name, _keyword_regex(words)) for name, words in _CATEGORY_KEYWORDS
)
_HIGH_SEVERITY_RE = _keyword_regex(
    ("attack", "strike", "crisis", "invasion", "nuclear", "killed")
)
_CRITICAL_SEVERITY_RE = _keyword_regex(
    ("critical", "emergency", "declaration", "imminent")
)


def _geocode_title(title: str):
    """Return (lat, lon) for the first region named in *title*, else (None, None).

    Regions are tried in ``GEO_DATA`` order. A match whose name is contained
    in another matching name is skipped, so "South China Sea" is not
    shadowed by "China".
    """
    hits = [
        region for region in GEO_DATA
        if re.search(r'\b' + re.escape(region) + r'\b', title, re.IGNORECASE)
    ]
    for region in hits:
        shadowed = any(
            other != region and region.lower() in other.lower()
            for other in hits
        )
        if not shadowed:
            lat, lon = GEO_DATA[region]
            return lat, lon
    return None, None


def _is_image_url(url: str, mime: str = '') -> bool:
    """Tell whether *url* points at an image, by MIME type or file extension."""
    if not url:
        return False
    if 'image' in mime:
        return True
    # Ignore query string and fragment: CDN image URLs often end in
    # "...jpg?width=460" and would otherwise fail the extension check.
    path = url.split('?', 1)[0].split('#', 1)[0]
    return path.lower().endswith(_IMAGE_EXTENSIONS)


def _extract_image(item) -> str:
    """Return the item's image URL from <enclosure> or <media:content>, or ''."""
    for tag in ('enclosure', f'{_MEDIA_NS}content'):
        node = item.find(tag)
        if node is None:
            continue
        url = node.get('url', '')
        if _is_image_url(url, node.get('type', '')):
            return url
    return ''


def _classify_title(title_lower: str):
    """Return (category, severity) derived from keywords in the lowercased title."""
    category = "GEOPOLITICS"
    for name, pattern in _CATEGORY_PATTERNS:
        if pattern.search(title_lower):
            category = name
            break
    severity = "MODERATE"
    if _HIGH_SEVERITY_RE.search(title_lower):
        severity = "HIGH"
    # Checked last on purpose: a critical keyword overrides HIGH.
    if _CRITICAL_SEVERITY_RE.search(title_lower):
        severity = "CRITICAL"
    return category, severity


def _article_from_item(item, feed_title: str, exclude_re):
    """Build the article dict for one <item>/<entry>; None if it is skipped.

    An item is skipped when it has no title or its title matches *exclude_re*.
    """
    title = _find_text(item, 'title')
    if not title:
        title = (item.findtext(f'{_ATOM_NS}title') or '').strip()
    if not title:
        return None
    title_lower = title.lower()
    if exclude_re.search(title_lower):
        return None

    lat, lon = _geocode_title(title)

    # Atom carries the link in an href attribute rather than in element text.
    link = _find_text(item, 'link')
    if not link:
        link_el = item.find(f'{_ATOM_NS}link')
        if link_el is not None:
            link = link_el.get('href', '')

    desc = _find_text(item, 'description') or _find_text(item, 'summary')
    if not desc:
        desc = item.findtext(f'{_ATOM_NS}summary') or ''
    desc = _strip_html(desc)[:200]

    # Fall back to the fetch time only when the feed supplies no date at all.
    pub_date = (
        _find_text(item, 'pubDate')
        or _find_text(item, 'published')
        or item.findtext(f'{_ATOM_NS}published')
        or item.findtext(f'{_ATOM_NS}updated')
        or datetime.now(timezone.utc).isoformat()
    )

    category, severity = _classify_title(title_lower)
    return {
        "title": title,
        "source": feed_title,
        "url": link,
        "image": _extract_image(item) or None,
        "lat": lat,
        "lon": lon,
        "summary": desc or "No details available.",
        "published_at": pub_date,
        "category": category,
        "severity": severity,
    }


async def _fetch_single_feed(
    client: "httpx.AsyncClient",
    feed_url: str,
    max_items: int = 10,
) -> list[dict]:
    """Fetch a single RSS/Atom feed directly and parse its items into articles.

    Args:
        client: Shared HTTP client used for the request.
        feed_url: URL of the feed.
        max_items: How many leading feed entries to consider; entries that
            are filtered out still count towards this limit.

    Returns:
        A list of article dicts with the keys ``title``, ``source``, ``url``,
        ``image``, ``lat``, ``lon``, ``summary``, ``published_at``,
        ``category`` and ``severity``. Any error is reported and whatever was
        parsed up to that point is returned, so one bad feed cannot break the
        whole refresh.
    """
    articles: list[dict] = []
    try:
        resp = await client.get(feed_url, timeout=12.0, follow_redirects=True)
        if resp.status_code != 200:
            print(f"[NEWS] Feed returned HTTP {resp.status_code} ({feed_url[:60]})")
            return []
        # NOTE(security): xml.etree is not hardened against maliciously
        # constructed XML (e.g. entity-expansion attacks), and feed bodies are
        # remote input. Keep FEEDS limited to trusted publishers or switch to
        # a hardened parser.
        root = ET.fromstring(resp.content)

        # Feed title: RSS keeps it under <channel>, Atom directly on the root.
        feed_title = 'Global Intel'
        channel = root.find('channel')
        if channel is not None:
            found_title = channel.findtext('title')
        else:
            found_title = root.findtext(f'{_ATOM_NS}title')
        if found_title:
            feed_title = found_title.strip()

        # Find items — RSS uses <item>, Atom uses <entry>
        items = root.findall('.//item')
        if not items:
            items = root.findall(f'.//{_ATOM_NS}entry')

        exclude_re = _keyword_regex(EXCLUDE_KEYWORDS)
        for item in items[:max_items]:
            article = _article_from_item(item, feed_title, exclude_re)
            if article is not None:
                articles.append(article)
    except Exception as e:
        print(f"[NEWS] Feed error ({feed_url[:60]}): {e}")
    return articles
def _load_stale_cache():
    """Return the content of ``CACHE_FILE`` regardless of its age, or None."""
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache: there is nothing to fall back to.
        return None


async def fetch_news():
    """Fetch all RSS feeds in parallel and return combined articles.

    A fresh cache is returned without any network traffic. Otherwise every
    URL in ``FEEDS`` is fetched concurrently and the combined result is
    cached. If no feed yields a single item, the previously cached items
    (if any) are kept and re-saved instead of being overwritten with an
    empty list.

    Returns:
        A list of article dicts; on an unexpected error the stale cache
        content if it can be read, otherwise an empty list.
    """
    cached = _get_cached_news()
    if cached is not None:
        return cached
    try:
        async with httpx.AsyncClient(
            headers={"User-Agent": "GodsEye/2.0 RSS Reader"},
        ) as client:
            results = await asyncio.gather(
                *[_fetch_single_feed(client, url) for url in FEEDS],
                return_exceptions=True,
            )
        articles: list[dict] = []
        for url, result in zip(FEEDS, results):
            if isinstance(result, list):
                articles.extend(result)
            else:
                # return_exceptions=True hands failures back as values.
                print(f"[NEWS] Feed task failed ({url[:60]}): {result!r}")
        print(f"[NEWS] Fetched {len(articles)} intelligence items from {len(FEEDS)} feeds.")
        if not articles:
            stale = _load_stale_cache()
            if stale:
                print("[NEWS] No items fetched; keeping previously cached items.")
                articles = stale
        # Saving even an unchanged or empty payload refreshes the cache
        # timestamp, so a total outage does not trigger a refetch per call.
        _save_cache(articles)
        return articles
    except Exception as e:
        print(f"[NEWS] Critical error: {e}")
    # fallback to stale cache if error
    stale = _load_stale_cache()
    return stale if stale is not None else []