269 lines
11 KiB
Python
269 lines
11 KiB
Python
import httpx
|
|
import asyncio
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from xml.etree import ElementTree as ET
|
|
|
|
# Location of the on-disk article cache: ".cache/news.json" inside the
# directory one level above the directory that contains this file.
CACHE_FILE = Path(__file__).parent.parent.joinpath(".cache", "news.json")

# Maximum age of the cache file, in seconds, before it counts as stale.
CACHE_DURATION_SEC = 5 * 60  # 5 minutes
|
|
|
|
def _get_cached_news():
    """Return the cached articles if the cache file is still fresh.

    The cache is fresh when ``CACHE_FILE`` was last modified less than
    ``CACHE_DURATION_SEC`` seconds ago.

    Returns:
        The JSON-decoded contents of ``CACHE_FILE``, or ``None`` when the
        file is missing, stale, unreadable, or does not contain valid JSON.
    """
    try:
        # stat() directly rather than exists() + stat(): the file may be
        # removed between the two calls, and stat() already reports that.
        age_sec = datetime.now().timestamp() - CACHE_FILE.stat().st_mtime
        if age_sec >= CACHE_DURATION_SEC:
            return None
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # No cache yet: the normal cold-start case, not worth logging.
        return None
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[NEWS] Ignoring unreadable cache: {e}")
        return None
|
|
|
|
def _save_cache(data):
    """Write *data* as JSON to ``CACHE_FILE`` on a best-effort basis.

    The JSON is first written to a sibling ``*.tmp`` file and then renamed
    over ``CACHE_FILE``, so a failed or interrupted dump never leaves a
    truncated cache behind.

    Args:
        data: Any JSON-serialisable object (the article list in practice).

    Returns:
        None. Failures are logged and swallowed; caching must never break
        the caller.
    """
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    try:
        # mkdir is inside the try: an unwritable location must not raise.
        CACHE_FILE.parent.mkdir(exist_ok=True, parents=True)
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f)
        tmp_file.replace(CACHE_FILE)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: data that json cannot serialise.
        print(f"[NEWS] Could not write cache: {e}")
        try:
            tmp_file.unlink()
        except OSError:
            # Nothing to clean up (or we cannot); the cache is optional.
            pass
|
|
|
|
# RSS/Atom feeds polled on every refresh. All URLs use HTTPS so feed content
# is not fetched in plaintext and no http -> https redirect hop is needed.
FEEDS = [
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://feeds.bbci.co.uk/news/world/rss.xml",
    "https://www.reutersagency.com/feed/?best-topics=political-general&post_type=best",
    "https://www.theguardian.com/world/rss",
    "https://feeds.npr.org/1004/rss.xml",
    "https://foreignpolicy.com/feed/",
    "https://www.cnbc.com/id/100727362/device/rss/rss.html",
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
    "https://feeds.washingtonpost.com/rss/world",
]
|
|
|
|
# Expanded keyword geocoder — covers most geopolitically relevant regions.
# Maps a place name that may appear in a headline to an approximate
# [latitude, longitude] pair in decimal degrees.
#
# Order matters: the feed parser walks this dict in insertion order and uses
# the first name found in the title, so a name that contains another key
# (e.g. "South China Sea" vs. "China") must be listed before the shorter one.
GEO_DATA = {
    # Middle East
    "Iran": [32.4279, 53.6880], "Israel": [31.0461, 34.8516],
    "Gaza": [31.3547, 34.3088], "West Bank": [31.9466, 35.3027],
    "Lebanon": [33.8547, 35.8623], "Syria": [34.8021, 38.9968],
    "Yemen": [15.5527, 48.5164], "Iraq": [33.2232, 43.6793],
    "Saudi Arabia": [23.8859, 45.0792], "Jordan": [30.5852, 36.2384],
    "Kuwait": [29.3117, 47.4818], "Qatar": [25.3548, 51.1839],
    "UAE": [23.4241, 53.8478], "Bahrain": [26.0667, 50.5577],
    "Oman": [21.5126, 55.9233],
    # Europe
    "Ukraine": [48.3794, 31.1656], "Russia": [61.5240, 105.3188],
    "Germany": [51.1657, 10.4515], "France": [46.2276, 2.2137],
    "UK": [55.3781, -3.4360], "Poland": [51.9194, 19.1451],
    "Romania": [45.9432, 24.9668], "Finland": [61.9241, 25.7482],
    "Sweden": [60.1282, 18.6435], "Norway": [60.4720, 8.4689],
    "NATO": [50.8503, 4.3517], "Belarus": [53.7098, 27.9534],
    "Moldova": [47.4116, 28.3699], "Georgia": [42.3154, 43.3569],
    "Serbia": [44.0165, 20.9129], "Kosovo": [42.6026, 20.9030],
    # Asia-Pacific ("South China Sea" first so "China" does not shadow it)
    "South China Sea": [12.0000, 113.0000],
    "China": [35.8617, 104.1954], "Taiwan": [23.6978, 120.9605],
    "North Korea": [40.3399, 127.5101], "South Korea": [35.9078, 127.7669],
    "Japan": [36.2048, 138.2529], "India": [20.5937, 78.9629],
    "Pakistan": [30.3753, 69.3451], "Afghanistan": [33.9391, 67.7100],
    "Myanmar": [21.9162, 95.9560], "Philippines": [12.8797, 121.7740],
    "Vietnam": [14.0583, 108.2772],
    # Americas
    "USA": [37.0902, -95.7129], "Mexico": [23.6345, -102.5528],
    "Venezuela": [6.4238, -66.5897], "Colombia": [4.5709, -74.2973],
    "Cuba": [21.5218, -77.7812], "Nicaragua": [12.8654, -85.2072],
    # Brazil lies south of the equator: the latitude is negative.
    "Haiti": [18.9712, -72.2852], "Brazil": [-14.2350, -51.9253],
    "Argentina": [-38.4161, -63.6167], "Chile": [-35.6751, -71.5430],
    "Peru": [-9.1900, -75.0152], "Guyana": [4.8604, -58.9302],
    # Central Asia & Caucasus
    "Kazakhstan": [48.0196, 66.9237], "Azerbaijan": [40.1431, 47.5769],
    "Armenia": [40.0691, 45.0382], "Nagorno-Karabakh": [39.8177, 46.7528],
    "Uzbekistan": [41.3775, 64.5853], "Kyrgyzstan": [41.2044, 74.7661],
    # Specific Conflict Regions & Strategic Spots
    # (Gaza and West Bank are listed once, under Middle East.)
    "Donbas": [48.0159, 37.8028], "Kashmir": [34.0837, 74.7973],
    "Sudan": [12.8628, 30.2176], "Darfur": [13.4175, 24.3311],
    "Tigray": [14.0323, 38.3166], "Somalia": [5.1521, 46.1996],
    "Suez Canal": [29.9329, 32.5539], "Panama Canal": [9.1012, -79.6967],
    "Bering Strait": [66.0, -169.0], "Malacca": [2.5, 102.0],
    # Cities
    "New York": [40.7128, -74.0060], "London": [51.5074, -0.1278],
    "Paris": [48.8566, 2.3522], "Brussels": [50.8503, 4.3517],
    "Geneva": [46.2044, 6.1432], "Vienna": [48.2082, 16.3738],
    "Istanbul": [41.0082, 28.9784], "Kyiv": [50.4501, 30.5234],
    "Moscow": [55.7558, 37.6173], "Tehran": [35.6892, 51.3890],
    "Beijing": [39.9042, 116.4074], "Tokyo": [35.6762, 139.6503],
    "Seoul": [37.5665, 126.9780],
}
|
|
|
|
# Lower-case keywords marking a headline as off-topic (sports, entertainment,
# lifestyle). The feed parser skips any item whose title contains one of them.
EXCLUDE_KEYWORDS = [
    # Sports
    "sport",
    "football",
    "soccer",
    "la liga",
    "champions league",
    "cup",
    "match",
    "olympics",
    "tennis",
    "nfl",
    "nba",
    "score",
    "goal",
    "premier league",
    "formula 1",
    "f1",
    "golf",
    "cricket",
    "rugby",
    "boxing",
    "mma",
    # Entertainment
    "celebrity",
    "oscars",
    "grammy",
    "fashion",
    "movie",
    "film",
    "series",
    # Lifestyle
    "recipe",
    "weather forecast",
    "horoscope",
]
|
|
|
|
# Matches one markup tag ("<" ... ">"); used to strip HTML from RSS descriptions.
_TAG_RE = re.compile(r'<[^>]+>')


def _strip_html(s: str) -> str:
    """Return *s* with every ``<...>`` tag removed and outer whitespace trimmed."""
    without_tags = re.sub(_TAG_RE, '', s)
    return without_tags.strip()
|
|
|
|
|
|
def _find_text(el: ET.Element, tag: str) -> str:
    """Return the stripped text of child *tag*, trying common feed namespaces.

    The unqualified tag is tried first, then the Dublin Core, RSS 1.0 and
    Atom namespaces, in that order. A candidate whose text is missing or
    only whitespace is skipped so a later namespace can still supply a value.

    Args:
        el: Parent element (an RSS ``<item>`` or Atom ``<entry>``).
        tag: Local tag name without namespace, e.g. ``"title"``.

    Returns:
        The first non-empty stripped text found, or ``''`` if there is none.
    """
    namespaces = (
        '',                                      # plain RSS 2.0
        '{http://purl.org/dc/elements/1.1/}',    # Dublin Core
        '{http://purl.org/rss/1.0/}',            # RSS 1.0
        '{http://www.w3.org/2005/Atom}',         # Atom
    )
    for ns in namespaces:
        node = el.find(f'{ns}{tag}')
        if node is None or not node.text:
            continue
        text = node.text.strip()
        if text:
            return text
    return ''
|
|
|
|
|
|
_ATOM_NS = '{http://www.w3.org/2005/Atom}'
_MEDIA_NS = '{http://search.yahoo.com/mrss/}'
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
_MAX_ITEMS_PER_FEED = 10


def _keyword_pattern(prefixes, whole_words=()) -> re.Pattern:
    """Compile a case-insensitive matcher for headline keywords.

    Every keyword must start at a word boundary, so "nfl" no longer matches
    inside "conflict", "mma" inside "commander" or "cup" inside "occupation".

    Args:
        prefixes: Keywords that may be followed by more letters
            ("strike" also matches "strikes", "cyber" matches "cyberattack").
        whole_words: Keywords that must stand alone, optionally with a plural
            "s" — for short words that are also the start of unrelated words
            ("war" vs. "warning", "iss" vs. "issue").

    Returns:
        A compiled pattern; it never matches when both lists are empty.
    """
    alternatives = [re.escape(p) for p in prefixes]
    alternatives += [re.escape(w) + r's?\b' for w in whole_words]
    if not alternatives:
        return re.compile(r'(?!)')  # empty keyword set: match nothing
    return re.compile(r'\b(?:' + '|'.join(alternatives) + r')', re.IGNORECASE)


# Headlines matching this are dropped as off-topic.
_EXCLUDE_RE = _keyword_pattern(EXCLUDE_KEYWORDS)

# Checked in order; the first matching category wins.
_CATEGORY_RULES = (
    ("CYBER", _keyword_pattern(
        ["cyber", "hacking", "breach", "malware", "botnet"])),
    ("SPACE", _keyword_pattern(
        ["satellite", "orbit", "rocket", "launch", "space"],
        whole_words=["iss"])),
    ("MILITARY", _keyword_pattern(
        ["military", "army", "navy", "airforce", "missile", "strike",
         "conflict", "nato", "defense",
         # compounds the old substring test on "war" used to catch
         "warfare", "warship", "warplane", "wartime", "warhead"],
        whole_words=["war"])),
)
_HIGH_SEVERITY_RE = _keyword_pattern(
    ["attack", "strike", "crisis", "invasion", "nuclear", "killed"])
_CRITICAL_SEVERITY_RE = _keyword_pattern(
    ["critical", "emergency", "declaration", "imminent"])


def _feed_title(root: ET.Element) -> str:
    """Return the feed's own title (RSS channel or Atom), or 'Global Intel'."""
    channel = root.find('channel')
    if channel is not None:
        raw = channel.findtext('title')
    else:
        # No <channel>: treat the document as an Atom feed.
        raw = root.findtext(f'{_ATOM_NS}title')
    return (raw or '').strip() or 'Global Intel'


def _geocode(title: str):
    """Return (lat, lon) of the first GEO_DATA name found in *title*, else (None, None)."""
    for region, coords in GEO_DATA.items():
        if re.search(r'\b' + re.escape(region) + r'\b', title, re.IGNORECASE):
            lat, lon = coords
            return lat, lon
    return None, None


def _has_image_extension(url: str) -> bool:
    """Tell whether the URL path (query string and fragment ignored) ends in an image suffix."""
    path = url.lower().split('?', 1)[0].split('#', 1)[0]
    return path.endswith(_IMAGE_EXTENSIONS)


def _extract_image(item: ET.Element) -> str:
    """Return an image URL from <enclosure> or <media:content>, or '' if none."""
    enclosure = item.find('enclosure')
    if enclosure is not None:
        url = enclosure.get('url', '')
        if url and ('image' in enclosure.get('type', '') or _has_image_extension(url)):
            return url
    media = item.find(f'{_MEDIA_NS}content')
    if media is not None:
        url = media.get('url', '')
        is_image = (
            media.get('medium') == 'image'
            or 'image' in media.get('type', '')
            or _has_image_extension(url)
        )
        if url and is_image:
            return url
    return ''


def _classify(title: str) -> tuple[str, str]:
    """Derive (category, severity) from the keywords present in *title*."""
    category = "GEOPOLITICS"
    for name, pattern in _CATEGORY_RULES:
        if pattern.search(title):
            category = name
            break

    severity = "MODERATE"
    if _HIGH_SEVERITY_RE.search(title):
        severity = "HIGH"
    if _CRITICAL_SEVERITY_RE.search(title):
        # CRITICAL outranks HIGH when both keyword sets match.
        severity = "CRITICAL"
    return category, severity


def _parse_item(item: ET.Element, feed_title: str):
    """Convert one RSS <item> / Atom <entry> into an article dict.

    Returns None when the item has no title or its title is off-topic.
    """
    title = (
        _find_text(item, 'title')
        or item.findtext(f'{_ATOM_NS}title')
        or ''
    ).strip()
    if not title or _EXCLUDE_RE.search(title):
        return None

    lat, lon = _geocode(title)

    link = _find_text(item, 'link')
    if not link:
        # Atom keeps the URL in the href attribute, not in the element text.
        link_el = item.find(f'{_ATOM_NS}link')
        if link_el is not None:
            link = link_el.get('href', '')

    desc = (
        _find_text(item, 'description')
        or _find_text(item, 'summary')
        or item.findtext(f'{_ATOM_NS}summary')
        or ''
    )
    desc = _strip_html(desc)[:200]

    # The feed's date string is passed through unparsed; the current UTC
    # time (ISO 8601) is used only when the item carries no date at all.
    pub_date = (
        _find_text(item, 'pubDate')
        or _find_text(item, 'published')
        or item.findtext(f'{_ATOM_NS}published')
        or item.findtext(f'{_ATOM_NS}updated')
        or datetime.now(timezone.utc).isoformat()
    )

    category, severity = _classify(title)

    return {
        "title": title,
        "source": feed_title,
        "url": link,
        "image": _extract_image(item) or None,
        "lat": lat,
        "lon": lon,
        "summary": desc or "No details available.",
        "published_at": pub_date,
        "category": category,
        "severity": severity,
    }


async def _fetch_single_feed(client: httpx.AsyncClient, feed_url: str) -> list[dict]:
    """Fetch one RSS/Atom feed and convert its first items into article dicts.

    Args:
        client: Shared HTTP client used for the request.
        feed_url: URL of the feed to download.

    Returns:
        Up to 10 article dicts with the keys ``title``, ``source``, ``url``,
        ``image``, ``lat``, ``lon``, ``summary``, ``published_at``,
        ``category`` and ``severity``. A non-200 response yields ``[]``.
        This function never raises for ordinary errors: on a network or
        parse failure it logs the error and returns the articles collected
        up to that point.
    """
    articles: list[dict] = []
    try:
        resp = await client.get(feed_url, timeout=12.0, follow_redirects=True)
        if resp.status_code != 200:
            return []

        # NOTE(security): xml.etree.ElementTree is documented as not secure
        # against maliciously constructed XML; feeds are third-party input.
        # Consider a hardened parser if the feed list ever becomes untrusted.
        root = ET.fromstring(resp.content)
        feed_title = _feed_title(root)

        # RSS uses <item>, Atom uses <entry>.
        items = root.findall('.//item') or root.findall(f'.//{_ATOM_NS}entry')
        for item in items[:_MAX_ITEMS_PER_FEED]:
            article = _parse_item(item, feed_title)
            if article is not None:
                articles.append(article)
    except Exception as e:
        # Boundary handler: one broken feed must not take down the others.
        print(f"[NEWS] Feed error ({feed_url[:60]}): {e}")
    return articles
|
|
|
|
|
|
def _load_stale_cache() -> list:
    """Return the cached article list regardless of its age, or [] if unusable."""
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache: nothing to fall back on.
        return []
    return data if isinstance(data, list) else []


async def fetch_news() -> list[dict]:
    """Fetch all RSS feeds in parallel and return the combined articles.

    A fresh cache (see ``_get_cached_news``) is returned without any network
    access. Otherwise every URL in ``FEEDS`` is fetched concurrently and the
    merged result is cached.

    Returns:
        The list of article dicts. If nothing could be fetched, the previous
        cache contents are returned even when stale, and ``[]`` only when no
        usable cache exists. This function does not raise for ordinary errors.
    """
    cached = _get_cached_news()
    if cached is not None:
        return cached

    articles: list[dict] = []
    try:
        async with httpx.AsyncClient(
            headers={"User-Agent": "GodsEye/2.0 RSS Reader"},
        ) as client:
            results = await asyncio.gather(
                *[_fetch_single_feed(client, url) for url in FEEDS],
                return_exceptions=True,
            )

        for url, result in zip(FEEDS, results):
            if isinstance(result, list):
                articles.extend(result)
            else:
                # return_exceptions=True hands failures back as values.
                print(f"[NEWS] Feed task failed ({url[:60]}): {result!r}")

        print(f"[NEWS] Fetched {len(articles)} intelligence items from {len(FEEDS)} feeds.")
        if articles:
            # Never cache an empty result: during an outage it would replace
            # the last good data and be served as "fresh" for the whole TTL.
            _save_cache(articles)
    except Exception as e:
        # Boundary handler: callers get data or [] instead of an exception.
        print(f"[NEWS] Critical error: {e}")

    if articles:
        # Fresh data wins even if writing the cache failed above.
        return articles
    # Nothing fetched: fall back to stale cache.
    return _load_stale_cache()
|