Files
2026-03-09 22:07:19 +01:00

245 lines
10 KiB
Python

import httpx
import asyncio
import re
from datetime import datetime, timezone
from difflib import SequenceMatcher
# GDELT DOC 2.0 API — free, no key.
# The query is a single OR-group of phrases focused on military attacks and
# kinetic events; the phrases are listed one per line so they are easy to audit.
# The trailing parameters are sent verbatim (artlist mode, JSON output,
# maxrecords=75, English sources, 12h timespan).
GDELT_URL = (
    "https://api.gdeltproject.org/api/v2/doc/doc?query=("
    + " OR ".join(
        (
            "missile attack",
            "rocket attack",
            "airstrike",
            "air strike",
            "bombing",
            "drone strike",
            "artillery fire",
            "shelling",
            "warship attack",
            "naval attack",
            "military strike",
            "armed attack",
            "mortar",
            "explosion site",
            "IED explosion",
            "sniper",
            "combat",
            "military offensive",
            "invasion force",
        )
    )
    + ")"
    + "&mode=artlist&format=json&maxrecords=75&sourcelang=english&timespan=12h"
)
# Geo-coder lookup table: keyword → [lat, lon] (latitude first, then longitude).
# fetch_conflicts() matches these keys against article titles as whole words,
# case-insensitively, walking the dict in insertion order — so the ORDER of the
# entries below influences which location is chosen when a title names several.
# Keep that in mind before re-sorting or regrouping the table.
# Alternate spellings share coordinates ("Kyiv"/"Kiev", "Donbas"/"Donbass"), and
# a few keys are armed groups rather than places ("Hezbollah", "Houthi"); those
# reuse the coordinates of the "Lebanon" and "Yemen" entries respectively.
GEO_DATA = {
# Ukraine conflict zones
"Ukraine": [48.3794, 31.1656], "Crimea": [44.9521, 34.1024],
"Donbas": [48.0159, 37.8028], "Donbass": [48.0159, 37.8028],
"Kherson": [46.6354, 32.6169], "Zaporizhzhia": [47.8388, 35.1396],
"Kharkiv": [49.9935, 36.2304], "Kyiv": [50.4501, 30.5234],
"Kiev": [50.4501, 30.5234], "Odesa": [46.4825, 30.7233],
"Mariupol": [47.0958, 37.5483], "Bakhmut": [48.5963, 38.0000],
"Avdiivka": [48.1344, 37.7490],
# Russia
"Russia": [61.5240, 105.3188], "Moscow": [55.7558, 37.6173],
"Belarus": [53.7098, 27.9534],
# Middle East
"Israel": [31.0461, 34.8516], "Gaza": [31.3547, 34.3088],
"West Bank": [31.9466, 35.3027], "Rafah": [31.2969, 34.2455],
"Jenin": [32.4607, 35.3027], "Lebanon": [33.8547, 35.8623],
"Hezbollah": [33.8547, 35.8623], "Syria": [34.8021, 38.9968],
"Damascus": [33.5138, 36.2765], "Aleppo": [36.2021, 37.1343],
"Iran": [32.4279, 53.6880], "Tehran": [35.6892, 51.3890],
"Iraq": [33.2232, 43.6793], "Baghdad": [33.3152, 44.3661],
"Yemen": [15.5527, 48.5164], "Houthi": [15.5527, 48.5164],
"Saudi Arabia": [23.8859, 45.0792],
# Asia-Pacific
"China": [35.8617, 104.1954], "Taiwan": [23.6978, 120.9605],
"Taipei": [25.0330, 121.5654], "Taiwan Strait": [24.0000, 119.5000],
"North Korea": [40.3399, 127.5101], "Pyongyang": [39.0194, 125.7381],
"South Korea": [35.9078, 127.7669], "Japan": [36.2048, 138.2529],
"Philippines": [12.8797, 121.7740], "South China Sea": [12.0000, 113.0000],
"Myanmar": [21.9162, 95.9560], "India": [20.5937, 78.9629],
"Pakistan": [30.3753, 69.3451], "Afghanistan": [33.9391, 67.7100],
"Kashmir": [34.0837, 74.7973],
# Africa
"Sudan": [12.8628, 30.2176], "Ethiopia": [9.1450, 40.4897],
"Somalia": [5.1521, 46.1996], "Libya": [26.3351, 17.2283],
"Mali": [17.5707, -3.9962], "Niger": [17.6078, 8.0817],
"Nigeria": [9.0820, 8.6753], "Congo": [-4.0383, 21.7587],
"Sahel": [15.4542, 0.0000], "Burkina Faso": [12.2383, -1.5616],
"Mozambique": [-18.6657, 35.5296],
# Americas
"Venezuela": [6.4238, -66.5897], "Colombia": [4.5709, -74.2973],
"Mexico": [23.6345, -102.5528],
# Strategic waterways
"Red Sea": [20.0000, 38.0000], "Strait of Hormuz": [26.5667, 56.2500],
"Bab-el-Mandeb": [12.6, 43.5], "Suez Canal": [30.4550, 32.3500],
"Black Sea": [43.4000, 34.0000], "Baltic Sea": [58.0000, 20.0000],
"East China Sea": [30.0000, 126.0000], "Sea of Azov": [46.0000, 36.5000],
"Persian Gulf": [26.0000, 52.0000],
# Palestine
"Palestine": [31.9522, 35.2332], "Jerusalem": [31.7683, 35.2137],
"Tel Aviv": [32.0853, 34.7818], "Nablus": [32.2211, 35.2544],
}
# Attack type classifier — ordered by priority (most specific first).
# Each entry: (search_term, event_label, severity).
# classify_event() lowercases the title and returns the label/severity of the
# FIRST entry whose search_term occurs in it as a plain substring. Consequences:
#   - search terms must be lowercase;
#   - a longer phrase has to be listed before any shorter term it contains
#     ("bombing" before "bomb", "ied explosion" before "explosion",
#     "naval attack" before "attack");
#   - the generic catch-alls ("attack", "strike", "troops", "military") stay last.
# Severity strings must be keys of SEVERITY_ORDER so the events can be ranked.
ATTACK_CLASSIFIER = [
("ballistic missile", "BALLISTIC MISSILE", "CRITICAL"),
("cruise missile", "CRUISE MISSILE", "CRITICAL"),
("hypersonic missile", "HYPERSONIC MISSILE", "CRITICAL"),
("missile strike", "MISSILE STRIKE", "CRITICAL"),
("missile attack", "MISSILE ATTACK", "CRITICAL"),
("rocket attack", "ROCKET ATTACK", "CRITICAL"),
("rocket barrage", "ROCKET BARRAGE", "CRITICAL"),
("drone strike", "DRONE STRIKE", "CRITICAL"),
("drone attack", "DRONE ATTACK", "CRITICAL"),
# Both spellings map to the same label
("airstrike", "AIRSTRIKE", "CRITICAL"),
("air strike", "AIRSTRIKE", "CRITICAL"),
("air raid", "AIR RAID", "CRITICAL"),
("bombing", "BOMBING", "CRITICAL"),
# Bare "bomb" (e.g. "bombs", "bombed") is reported as a generic explosion
("bomb", "EXPLOSION", "HIGH"),
("shelling", "ARTILLERY SHELLING", "HIGH"),
("artillery", "ARTILLERY FIRE", "HIGH"),
("mortar", "MORTAR ATTACK", "HIGH"),
("ied explosion", "IED EXPLOSION", "HIGH"),
("explosion", "EXPLOSION", "HIGH"),
("sniper", "SNIPER ACTIVITY", "MODERATE"),
("naval attack", "NAVAL ATTACK", "HIGH"),
("warship", "NAVAL ACTIVITY", "MODERATE"),
("invasion", "INVASION", "CRITICAL"),
("offensive", "MILITARY OFFENSIVE", "HIGH"),
("combat", "COMBAT", "HIGH"),
# Generic fallbacks — only reached when nothing more specific matched
("attack", "ATTACK", "HIGH"),
("strike", "STRIKE", "HIGH"),
("troops", "GROUND FORCES", "MODERATE"),
("military", "MILITARY ACTIVITY", "MODERATE"),
]
# Sort rank per severity label: a lower rank sorts earlier, so CRITICAL events
# come first. Labels missing from this map are ranked 3 by fetch_conflicts().
SEVERITY_ORDER = {
    "CRITICAL": 0,
    "HIGH": 1,
    "MODERATE": 2,
}
# Lowercase keywords that mark an article title as off-topic; fetch_conflicts()
# skips any title that matches one of them.
EXCLUDE_KEYWORDS = [
    # sports
    "sport",
    "football",
    "soccer",
    "la liga",
    "cup",
    "olympics",
    "tennis",
    "nfl",
    "nba",
    "golf",
    "cricket",
    "rugby",
    # entertainment / lifestyle
    "celebrity",
    "fashion",
    "movie",
    "film",
    "oscars",
    "grammy",
    "recipe",
    "horoscope",
    "box office",
    "netflix",
    # civilian launches
    "rocket launch",
    "spacex",
    "starship",
    "nasa launch",
]
def classify_event(title: str) -> tuple[str, str]:
    """Classify a conflict headline by attack type.

    The title is lowercased and checked against ATTACK_CLASSIFIER in list
    order; the first entry whose search term occurs in the title decides the
    result.

    Args:
        title: Article headline.

    Returns:
        ``(event_label, severity)`` of the first matching classifier entry, or
        ``("MILITARY EVENT", "MODERATE")`` when no entry matches.
    """
    lowered = title.lower()
    first_hit = next(
        (
            (label, severity)
            for term, label, severity in ATTACK_CLASSIFIER
            if term in lowered
        ),
        None,
    )
    if first_hit is None:
        return "MILITARY EVENT", "MODERATE"
    return first_hit
def _title_similarity(a: str, b: str) -> float:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def _parse_gdelt_date(seendate: str) -> str:
try:
dt = datetime.strptime(seendate, "%Y%m%dT%H%M%SZ")
return dt.replace(tzinfo=timezone.utc).isoformat()
except Exception:
return datetime.now(timezone.utc).isoformat()
def _is_off_topic(title: str) -> bool:
    """Return True if *title* mentions an EXCLUDE_KEYWORDS entry at a word start.

    A keyword only counts when it begins on a word boundary. That keeps
    suffix forms working ("sport" matches "sports", "cup" matches "cups") while
    avoiding hits in the middle of unrelated words: "nfl" inside "conflict",
    "nba" inside "Donbas", "cup" inside "occupied", "sport" inside "transport".
    """
    lowered = title.lower()
    return any(
        re.search(r"\b" + re.escape(keyword), lowered) is not None
        for keyword in EXCLUDE_KEYWORDS
    )


def _geocode_title(title: str):
    """Return the GEO_DATA ``[lat, lon]`` entry for the place named in *title*.

    Regions are matched as whole words, case-insensitively, and the first
    match in GEO_DATA order is selected. If a longer matched region contains
    that first match (e.g. "Taiwan Strait" vs "Taiwan", "South China Sea" vs
    "China"), the longer, more specific region is used instead; otherwise the
    shorter key would always shadow it.

    Returns None when no region is mentioned.
    """
    matched = [
        region
        for region in GEO_DATA
        if re.search(r"\b" + re.escape(region) + r"\b", title, re.IGNORECASE)
    ]
    if not matched:
        return None
    anchor = matched[0].lower()
    # matched[0] itself is always a candidate, so max() never sees an empty
    # sequence; on equal length the earlier GEO_DATA entry is kept.
    most_specific = max(
        (region for region in matched if anchor in region.lower()),
        key=len,
    )
    return GEO_DATA[most_specific]


async def fetch_conflicts() -> list:
    """
    Fetch military attack/conflict events from the GDELT DOC API.

    The request is retried up to three times when GDELT answers 429, sleeping
    6 s and then 12 s between attempts. Articles are then filtered (off-topic
    keywords, no recognisable location, near-duplicate titles), classified by
    attack type and severity, sorted and truncated to 40 entries.

    Returns:
        A list of event dicts with the keys ``title``, ``url``, ``image``,
        ``source``, ``domain``, ``lat``, ``lon``, ``published_at``,
        ``event_type``, ``severity`` and ``type`` (always ``"conflict"``),
        ordered by severity (CRITICAL first) and then by ``published_at``.
        An empty list is returned on any HTTP, parsing or network failure;
        errors are printed, never raised.
    """
    try:
        async with httpx.AsyncClient() as client:
            # Retry with backoff for 429 rate limits
            for attempt in range(3):
                if attempt > 0:
                    await asyncio.sleep(6 * attempt)
                response = await client.get(GDELT_URL, timeout=12.0)
                if response.status_code != 429:
                    break
                print(f"[CONFLICTS] GDELT rate limited, retry {attempt+1}/3")

        # Also covers the case where every attempt was rate limited (429).
        if response.status_code != 200:
            print(f"[CONFLICTS] GDELT returned {response.status_code}")
            return []

        try:
            # An empty body is treated as "no articles", not as an error.
            data = response.json() if response.text.strip() else {}
        except ValueError:
            print("[CONFLICTS] GDELT returned non-JSON body")
            return []

        raw_articles = data.get("articles", []) if isinstance(data, dict) else []
        if not raw_articles:
            print("[CONFLICTS] No articles in GDELT response")
            return []

        events = []
        for article in raw_articles:
            title = (article.get("title") or "").strip()
            if not title:
                continue

            # Filter out non-conflict content
            if _is_off_topic(title):
                continue

            # Geocode; articles without a recognisable location are dropped
            coords = _geocode_title(title)
            if coords is None:
                continue
            lat, lon = coords
            if lat is None or lon is None:
                continue
            if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
                continue

            # Deduplicate by title similarity against the events kept so far
            if any(_title_similarity(title, ev["title"]) > 0.75 for ev in events):
                continue

            event_label, severity = classify_event(title)
            seendate = article.get("seendate", "")
            published = (
                _parse_gdelt_date(seendate)
                if seendate
                else datetime.now(timezone.utc).isoformat()
            )
            events.append({
                "title": title,
                "url": article.get("url", ""),
                "image": article.get("urlMobileImage", ""),
                "source": article.get("domain", "Unknown"),
                "domain": article.get("sourcecountry", "Unknown"),
                "lat": lat,
                "lon": lon,
                "published_at": published,
                "event_type": event_label,
                "severity": severity,
                "type": "conflict",
            })

        # Sort: CRITICAL first, then by date.
        # NOTE(review): published_at sorts ascending, i.e. oldest first within
        # a severity, and the [:40] cut below therefore keeps the oldest —
        # confirm this is intended.
        events.sort(key=lambda e: (
            SEVERITY_ORDER.get(e["severity"], 3),
            e["published_at"],
        ))
        events = events[:40]

        by_sev = {}
        for e in events:
            s = e["severity"]
            by_sev[s] = by_sev.get(s, 0) + 1
        print(f"[CONFLICTS] {len(events)} military events — {by_sev} (from {len(raw_articles)} raw)")
        return events
    except Exception as e:
        # Top-level boundary: callers get an empty list instead of an exception.
        print(f"[CONFLICTS] Fetch error: {e}")
        return []
if __name__ == "__main__":
    # Manual smoke run: fetch once, then print the count and a preview of the
    # first 15 events with their severity, event type and truncated title.
    fetched = asyncio.run(fetch_conflicts())
    print(f"\nMilitary events: {len(fetched)}")
    preview = fetched[:15]
    for item in preview:
        severity = item['severity']
        kind = item['event_type']
        headline = item['title'][:70]
        print(f" [{severity:8s}] [{kind:22s}] {headline}")