# argus-nexus/backend/services/news.py

import httpx
import asyncio
import re
import json
from pathlib import Path
from datetime import datetime, timezone
from xml.etree import ElementTree as ET

# On-disk cache: "<directory two levels above this file>/.cache/news.json".
CACHE_FILE = Path(__file__).parent.parent.joinpath(".cache", "news.json")
# Maximum age (in seconds) for which the cached payload is served as-is.
CACHE_DURATION_SEC = 5 * 60

def _get_cached_news():
    """Return the cached news payload if the cache file is still fresh.

    The cache is considered fresh while the file's modification time is less
    than ``CACHE_DURATION_SEC`` seconds old.

    Returns:
        The decoded JSON content of ``CACHE_FILE``, or ``None`` when the file
        is missing, too old, unreadable, or does not contain valid JSON.
    """
    try:
        # stat() directly instead of exists() + stat(): a missing file simply
        # raises FileNotFoundError (an OSError), with no window in which the
        # file can vanish between the two calls.
        age = datetime.now(timezone.utc).timestamp() - CACHE_FILE.stat().st_mtime
        if age >= CACHE_DURATION_SEC:
            return None
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file.  ValueError: corrupt JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        return None

def _save_cache(data):
    """Persist *data* as JSON in ``CACHE_FILE`` on a best-effort basis.

    Caching is an optimisation, so this function never raises for I/O or
    serialisation problems; a failure is reported on stdout and the previous
    cache file (if any) is left in place.

    Args:
        data: JSON-serialisable payload to store.
    """
    try:
        # Serialise first: if *data* cannot be encoded, the existing cache
        # file is not truncated.
        payload = json.dumps(data)
        # Creating the directory is inside the try as well, so an unwritable
        # location cannot propagate an exception to the caller.
        CACHE_FILE.parent.mkdir(exist_ok=True, parents=True)
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: payload not JSON-serialisable.  OSError: disk.
        print(f"[NEWS] Cache write failed: {e}")

# RSS/Atom feed URLs polled by fetch_news(); each one is fetched and parsed
# independently by _fetch_single_feed().
# NOTE(review): the first two entries use plain http://. The fetcher follows
# redirects, but confirm whether the https:// endpoints should be used instead.
FEEDS = [
    "http://www.aljazeera.com/xml/rss/all.xml",
    "http://feeds.bbci.co.uk/news/world/rss.xml",
    "https://www.reutersagency.com/feed/?best-topics=political-general&post_type=best",
    "https://www.theguardian.com/world/rss",
    "https://feeds.npr.org/1004/rss.xml",
    "https://foreignpolicy.com/feed/",
    "https://www.cnbc.com/id/100727362/device/rss/rss.html",
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
    "https://feeds.washingtonpost.com/rss/world",
]

# Expanded keyword geocoder — covers most geopolitically relevant regions.
# Maps a place name to [latitude, longitude] in decimal degrees.
# Insertion order matters: _fetch_single_feed() uses the FIRST entry whose
# name appears in a headline, so entries listed earlier win over later ones.
# Each name must appear only once (a repeated key silently overwrites the
# earlier value).
GEO_DATA = {
    # Middle East
    "Iran": [32.4279, 53.6880], "Israel": [31.0461, 34.8516],
    "Gaza": [31.3547, 34.3088], "West Bank": [31.9466, 35.3027],
    "Lebanon": [33.8547, 35.8623], "Syria": [34.8021, 38.9968],
    "Yemen": [15.5527, 48.5164], "Iraq": [33.2232, 43.6793],
    "Saudi Arabia": [23.8859, 45.0792], "Jordan": [30.5852, 36.2384],
    "Kuwait": [29.3117, 47.4818], "Qatar": [25.3548, 51.1839],
    "UAE": [23.4241, 53.8478], "Bahrain": [26.0667, 50.5577],
    "Oman": [21.5126, 55.9233],
    # Europe
    "Ukraine": [48.3794, 31.1656], "Russia": [61.5240, 105.3188],
    "Germany": [51.1657, 10.4515], "France": [46.2276, 2.2137],
    "UK": [55.3781, -3.4360], "Poland": [51.9194, 19.1451],
    "Romania": [45.9432, 24.9668], "Finland": [61.9241, 25.7482],
    "Sweden": [60.1282, 18.6435], "Norway": [60.4720, 8.4689],
    "NATO": [50.8503, 4.3517], "Belarus": [53.7098, 27.9534],
    "Moldova": [47.4116, 28.3699], "Georgia": [42.3154, 43.3569],
    "Serbia": [44.0165, 20.9129], "Kosovo": [42.6026, 20.9030],
    # Asia-Pacific
    "China": [35.8617, 104.1954], "Taiwan": [23.6978, 120.9605],
    "North Korea": [40.3399, 127.5101], "South Korea": [35.9078, 127.7669],
    "Japan": [36.2048, 138.2529], "India": [20.5937, 78.9629],
    "Pakistan": [30.3753, 69.3451], "Afghanistan": [33.9391, 67.7100],
    "Myanmar": [21.9162, 95.9560], "Philippines": [12.8797, 121.7740],
    "Vietnam": [14.0583, 108.2772], "South China Sea": [12.0000, 113.0000],
    # Americas
    "USA": [37.0902, -95.7129], "Mexico": [23.6345, -102.5528],
    "Venezuela": [6.4238, -66.5897], "Colombia": [4.5709, -74.2973],
    "Cuba": [21.5218, -77.7812], "Nicaragua": [12.8654, -85.2072],
    # Brazil lies in the southern hemisphere: latitude is negative.
    "Haiti": [18.9712, -72.2852], "Brazil": [-14.2350, -51.9253],
    "Argentina": [-38.4161, -63.6167], "Chile": [-35.6751, -71.5430],
    "Peru": [-9.1900, -75.0152], "Guyana": [4.8604, -58.9302],
    # Central Asia & Caucasus
    "Kazakhstan": [48.0196, 66.9237], "Azerbaijan": [40.1431, 47.5769],
    "Armenia": [40.0691, 45.0382], "Nagorno-Karabakh": [39.8177, 46.7528],
    "Uzbekistan": [41.3775, 64.5853], "Kyrgyzstan": [41.2044, 74.7661],
    # Specific Conflict Regions & Strategic Spots
    # (Gaza and the West Bank are listed under "Middle East" above.)
    "Donbas": [48.0159, 37.8028], "Kashmir": [34.0837, 74.7973],
    "Sudan": [12.8628, 30.2176], "Darfur": [13.4175, 24.3311],
    "Tigray": [14.0323, 38.3166], "Somalia": [5.1521, 46.1996],
    "Suez Canal": [29.9329, 32.5539], "Panama Canal": [9.1012, -79.6967],
    "Bering Strait": [66.0, -169.0], "Malacca": [2.5, 102.0],
    # Cities
    "New York": [40.7128, -74.0060], "London": [51.5074, -0.1278],
    "Paris": [48.8566, 2.3522], "Brussels": [50.8503, 4.3517],
    "Geneva": [46.2044, 6.1432], "Vienna": [48.2082, 16.3738],
    "Istanbul": [41.0082, 28.9784], "Kyiv": [50.4501, 30.5234],
    "Moscow": [55.7558, 37.6173], "Tehran": [35.6892, 51.3890],
    "Beijing": [39.9042, 116.4074], "Tokyo": [35.6762, 139.6503],
    "Seoul": [37.5665, 126.9780],
}

# Keywords used by _fetch_single_feed() to drop off-topic headlines (sports,
# entertainment, lifestyle). Keep every entry lowercase: they are compared
# against the lowercased article title.
EXCLUDE_KEYWORDS = [
    "sport", "football", "soccer", "la liga", "champions league", "cup", "match",
    "olympics", "tennis", "nfl", "nba", "score", "goal", "premier league",
    "formula 1", "f1", "golf", "cricket", "rugby", "boxing", "mma",
    "celebrity", "oscars", "grammy", "fashion", "movie", "film", "series",
    "recipe", "weather forecast", "horoscope"
]

# Matches one markup tag: "<", one or more non-">" characters, then ">".
_TAG_RE = re.compile(r'<[^>]+>')


def _strip_html(s: str) -> str:
    """Return *s* with every ``<...>`` tag removed and outer whitespace trimmed."""
    without_tags = _TAG_RE.sub('', s)
    return without_tags.strip()


def _find_text(el: ET.Element, tag: str) -> str:
    """Return the stripped text of child *tag* of *el*, or '' if there is none.

    The un-namespaced tag is tried first, then the same tag in the Dublin Core
    and RSS 1.0 namespaces. The first child that exists and has non-empty text
    wins.
    """
    prefixes = (
        '',  # plain, un-namespaced tag
        '{http://purl.org/dc/elements/1.1/}',
        '{http://purl.org/rss/1.0/}',
    )
    for prefix in prefixes:
        found = el.find(prefix + tag)
        if found is not None and found.text:
            return found.text.strip()
    return ''


_ATOM_NS = '{http://www.w3.org/2005/Atom}'
_MEDIA_NS = '{http://search.yahoo.com/mrss/}'
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')

# (category, keywords) pairs, checked in order; the first hit wins.
# Compounds such as "airstrike", "warfare" and "aerospace" are listed
# explicitly because keywords only match at the start of a word.
_CATEGORY_KEYWORDS = (
    ("CYBER", ("cyber", "hacking", "breach", "malware", "botnet")),
    ("SPACE", ("satellite", "orbit", "rocket", "launch", "space", "aerospace", "iss")),
    ("MILITARY", ("military", "army", "navy", "airforce", "missile", "strike",
                  "airstrike", "war", "warfare", "conflict", "nato", "defense")),
)
# Severity terms are long, distinctive stems, so plain substring matching is
# kept on purpose ("attacked", "airstrike", "cyberattack" all count).
_HIGH_SEVERITY_TERMS = ("attack", "strike", "crisis", "invasion", "nuclear", "killed")
_CRITICAL_SEVERITY_TERMS = ("critical", "emergency", "declaration", "imminent")


def _keyword_regex(keywords) -> re.Pattern:
    """Compile *keywords* into one pattern that matches them at a word start.

    Plain substring tests produce false hits inside unrelated words
    ("conflict" contains "nfl", "Emmanuel" contains "mma", "missile" contains
    "iss", "senator" contains "nato"). Every keyword must therefore begin at a
    word boundary. Keywords of three characters or fewer must additionally end
    at one (an optional plural "s" is tolerated), because they are prefixes of
    many unrelated words ("war" -> "warning", "iss" -> "issue").
    """
    alternatives = []
    for kw in keywords:
        suffix = r's?\b' if len(kw) <= 3 else ''
        alternatives.append(re.escape(kw) + suffix)
    if not alternatives:
        # An empty alternation would match everywhere; return a pattern that
        # can never match instead.
        return re.compile(r'(?!)')
    return re.compile(r'\b(?:' + '|'.join(alternatives) + r')')


_CATEGORY_PATTERNS = tuple(
    (name, _keyword_regex(words)) for name, words in _CATEGORY_KEYWORDS
)


def _feed_title(root: ET.Element) -> str:
    """Return the feed's own title (RSS channel or Atom), or 'Global Intel'."""
    channel = root.find('channel')
    if channel is not None:
        raw = channel.findtext('title')
    else:
        # No <channel>: treat the document as an Atom feed.
        raw = root.findtext(_ATOM_NS + 'title')
    return (raw or '').strip() or 'Global Intel'


def _geocode_title(title: str):
    """Return (lat, lon) of the first GEO_DATA name found in *title*, else (None, None)."""
    for region, coords in GEO_DATA.items():
        if re.search(r'\b' + re.escape(region) + r'\b', title, re.IGNORECASE):
            lat, lon = coords
            return lat, lon
    return None, None


def _has_image_extension(url: str) -> bool:
    """Tell whether *url*'s path ends in a known image extension."""
    # Drop any query string / fragment first: CDN URLs commonly look like
    # ".../photo.jpg?width=460".
    path = url.split('?', 1)[0].split('#', 1)[0]
    return path.lower().endswith(_IMAGE_EXTENSIONS)


def _extract_image(item: ET.Element) -> str:
    """Return an image URL from <enclosure>, media:content or media:thumbnail, or ''."""
    enclosure = item.find('enclosure')
    if enclosure is not None:
        url = enclosure.get('url', '')
        if url and ('image' in enclosure.get('type', '') or _has_image_extension(url)):
            return url

    media = item.find(_MEDIA_NS + 'content')
    if media is not None:
        url = media.get('url', '')
        declared_image = (
            media.get('medium', '').lower() == 'image'
            or 'image' in media.get('type', '')
        )
        if url and (declared_image or _has_image_extension(url)):
            return url

    # media:thumbnail is an image by definition.
    thumb = item.find(_MEDIA_NS + 'thumbnail')
    if thumb is not None:
        return thumb.get('url', '')
    return ''


def _classify_title(title_lower: str):
    """Return (category, severity) for an already lowercased headline."""
    category = "GEOPOLITICS"
    for name, pattern in _CATEGORY_PATTERNS:
        if pattern.search(title_lower):
            category = name
            break

    severity = "MODERATE"
    if any(term in title_lower for term in _HIGH_SEVERITY_TERMS):
        severity = "HIGH"
    # Checked last so that a critical term overrides HIGH.
    if any(term in title_lower for term in _CRITICAL_SEVERITY_TERMS):
        severity = "CRITICAL"
    return category, severity


def _parse_item(item: ET.Element, feed_title: str, exclude_re: re.Pattern):
    """Convert one RSS <item> / Atom <entry> into an article dict.

    Returns ``None`` when the item has no title or its title matches
    *exclude_re*.
    """
    title = _find_text(item, 'title')
    if not title:
        title = (item.findtext(_ATOM_NS + 'title') or '').strip()
    if not title:
        return None

    title_lower = title.lower()
    if exclude_re.search(title_lower):
        return None

    lat, lon = _geocode_title(title)

    link = _find_text(item, 'link')
    if not link:
        # Atom carries the URL in the href attribute, not in the element text.
        link_el = item.find(_ATOM_NS + 'link')
        if link_el is not None:
            link = link_el.get('href', '')

    desc = _find_text(item, 'description') or _find_text(item, 'summary')
    if not desc:
        desc = item.findtext(_ATOM_NS + 'summary') or ''
    desc = _strip_html(desc)[:200]

    # The feed's own date string is passed through unchanged; only when the
    # item carries no date at all is the current UTC time used.
    pub_date = (
        _find_text(item, 'pubDate')
        or _find_text(item, 'published')
        or item.findtext(_ATOM_NS + 'published')
        or item.findtext(_ATOM_NS + 'updated')
        or datetime.now(timezone.utc).isoformat()
    )

    category, severity = _classify_title(title_lower)

    return {
        "title": title,
        "source": feed_title,
        "url": link,
        "image": _extract_image(item) or None,
        "lat": lat,
        "lon": lon,
        "summary": desc or "No details available.",
        "published_at": pub_date,
        "category": category,
        "severity": severity
    }


async def _fetch_single_feed(client: httpx.AsyncClient, feed_url: str) -> list[dict]:
    """Fetch one RSS/Atom feed and return up to 10 parsed article dicts.

    Args:
        client: Shared HTTP client used for the request.
        feed_url: URL of the feed to download.

    Returns:
        A list of article dicts with the keys ``title``, ``source``, ``url``,
        ``image``, ``lat``, ``lon``, ``summary``, ``published_at``,
        ``category`` and ``severity``. Items without a title, or whose title
        matches ``EXCLUDE_KEYWORDS``, are skipped. The list is empty on a
        non-200 response. Any exception is logged and swallowed; articles
        parsed before the failure are still returned.
    """
    articles: list[dict] = []
    try:
        resp = await client.get(feed_url, timeout=12.0, follow_redirects=True)
        if resp.status_code != 200:
            return []

        # NOTE(security): the payload is third-party XML and xml.etree is not
        # hardened against maliciously constructed documents (e.g. entity
        # expansion). Consider a hardened parser such as defusedxml if that
        # dependency is acceptable.
        root = ET.fromstring(resp.content)
        feed_title = _feed_title(root)

        # RSS uses <item>, Atom uses <entry>.
        items = root.findall('.//item') or root.findall('.//' + _ATOM_NS + 'entry')

        # Compiled once per feed rather than once per item.
        exclude_re = _keyword_regex(EXCLUDE_KEYWORDS)

        for item in items[:10]:
            article = _parse_item(item, feed_title, exclude_re)
            if article is not None:
                articles.append(article)
    except Exception as e:
        # Best-effort boundary: one broken feed must not fail the whole batch.
        print(f"[NEWS] Feed error ({feed_url[:60]}): {e}")
    return articles


def _load_stale_cache():
    """Return the cache file's content regardless of its age, or [] if unusable."""
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, or content that is not valid JSON.
        return []


async def fetch_news():
    """Fetch all RSS feeds in parallel and return the combined articles.

    A fresh cache is returned directly. Otherwise every URL in ``FEEDS`` is
    fetched concurrently; a non-empty result is cached and returned.

    When nothing could be fetched (an unexpected error, or every feed came
    back empty, e.g. during a network outage) the last cached payload is
    returned even if it has expired, and the cache is NOT overwritten, so an
    outage cannot replace good data with an empty list.

    Returns:
        A list of article dicts (or the cached JSON payload); ``[]`` when
        nothing could be fetched and no cache is available.
    """
    cached = _get_cached_news()
    if cached is not None:
        return cached

    articles: list[dict] = []
    try:
        async with httpx.AsyncClient(
            headers={"User-Agent": "GodsEye/2.0 RSS Reader"},
        ) as client:
            results = await asyncio.gather(
                *[_fetch_single_feed(client, url) for url in FEEDS],
                return_exceptions=True,
            )
        for r in results:
            # return_exceptions=True: failed tasks show up as exception
            # objects, which are skipped here.
            if isinstance(r, list):
                articles.extend(r)
        print(f"[NEWS] Fetched {len(articles)} intelligence items from {len(FEEDS)} feeds.")
    except Exception as e:
        print(f"[NEWS] Critical error: {e}")

    if articles:
        try:
            _save_cache(articles)
        except Exception as e:
            # Caching is best-effort: never lose freshly fetched articles
            # because the cache could not be written.
            print(f"[NEWS] Cache write failed: {e}")
        return articles

    # Nothing usable was fetched: fall back to the stale cache.
    return _load_stale_cache()