245 lines
10 KiB
Python
245 lines
10 KiB
Python
import httpx
|
|
import asyncio
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from difflib import SequenceMatcher
|
|
|
|
# GDELT DOC 2.0 API — free, no key
# Focused specifically on military attacks and kinetic events
#
# The URL is assembled from adjacent string literals: the endpoint, one
# parenthesised OR-query of attack-related phrases, then the fixed request
# options (article-list mode, JSON output, at most 75 records, 12 hour window).
# NOTE(review): spaces in the query are left unescaped here — presumably the
# HTTP client percent-encodes them when the request is sent; confirm.
GDELT_URL = (
    "https://api.gdeltproject.org/api/v2/doc/doc?"
    "query=(missile attack OR rocket attack OR airstrike OR air strike OR "
    "bombing OR drone strike OR artillery fire OR shelling OR warship attack OR "
    "naval attack OR military strike OR armed attack OR mortar OR explosion site OR "
    "IED explosion OR sniper OR combat OR military offensive OR invasion force)"
    # "&timespan" had been corrupted to "×pan" (the "&times" prefix decoded as
    # an HTML entity), which silently dropped the 12h look-back window.
    "&mode=artlist&format=json&maxrecords=75&sourcelang=english&timespan=12h"
)
|
|
|
|
# Gazetteer used for geocoding headlines: place/actor keyword -> [lat, lon].
# Insertion order is significant because consumers iterate this dict in order.
GEO_DATA = {
    # --- Ukraine conflict zones ---
    "Ukraine": [48.3794, 31.1656],
    "Crimea": [44.9521, 34.1024],
    "Donbas": [48.0159, 37.8028],
    "Donbass": [48.0159, 37.8028],
    "Kherson": [46.6354, 32.6169],
    "Zaporizhzhia": [47.8388, 35.1396],
    "Kharkiv": [49.9935, 36.2304],
    "Kyiv": [50.4501, 30.5234],
    "Kiev": [50.4501, 30.5234],
    "Odesa": [46.4825, 30.7233],
    "Mariupol": [47.0958, 37.5483],
    "Bakhmut": [48.5963, 38.0000],
    "Avdiivka": [48.1344, 37.7490],
    # --- Russia ---
    "Russia": [61.5240, 105.3188],
    "Moscow": [55.7558, 37.6173],
    "Belarus": [53.7098, 27.9534],
    # --- Middle East ---
    "Israel": [31.0461, 34.8516],
    "Gaza": [31.3547, 34.3088],
    "West Bank": [31.9466, 35.3027],
    "Rafah": [31.2969, 34.2455],
    "Jenin": [32.4607, 35.3027],
    "Lebanon": [33.8547, 35.8623],
    "Hezbollah": [33.8547, 35.8623],
    "Syria": [34.8021, 38.9968],
    "Damascus": [33.5138, 36.2765],
    "Aleppo": [36.2021, 37.1343],
    "Iran": [32.4279, 53.6880],
    "Tehran": [35.6892, 51.3890],
    "Iraq": [33.2232, 43.6793],
    "Baghdad": [33.3152, 44.3661],
    "Yemen": [15.5527, 48.5164],
    "Houthi": [15.5527, 48.5164],
    "Saudi Arabia": [23.8859, 45.0792],
    # --- Asia-Pacific ---
    "China": [35.8617, 104.1954],
    "Taiwan": [23.6978, 120.9605],
    "Taipei": [25.0330, 121.5654],
    "Taiwan Strait": [24.0000, 119.5000],
    "North Korea": [40.3399, 127.5101],
    "Pyongyang": [39.0194, 125.7381],
    "South Korea": [35.9078, 127.7669],
    "Japan": [36.2048, 138.2529],
    "Philippines": [12.8797, 121.7740],
    "South China Sea": [12.0000, 113.0000],
    "Myanmar": [21.9162, 95.9560],
    "India": [20.5937, 78.9629],
    "Pakistan": [30.3753, 69.3451],
    "Afghanistan": [33.9391, 67.7100],
    "Kashmir": [34.0837, 74.7973],
    # --- Africa ---
    "Sudan": [12.8628, 30.2176],
    "Ethiopia": [9.1450, 40.4897],
    "Somalia": [5.1521, 46.1996],
    "Libya": [26.3351, 17.2283],
    "Mali": [17.5707, -3.9962],
    "Niger": [17.6078, 8.0817],
    "Nigeria": [9.0820, 8.6753],
    "Congo": [-4.0383, 21.7587],
    "Sahel": [15.4542, 0.0000],
    "Burkina Faso": [12.2383, -1.5616],
    "Mozambique": [-18.6657, 35.5296],
    # --- Americas ---
    "Venezuela": [6.4238, -66.5897],
    "Colombia": [4.5709, -74.2973],
    "Mexico": [23.6345, -102.5528],
    # --- Strategic waterways ---
    "Red Sea": [20.0000, 38.0000],
    "Strait of Hormuz": [26.5667, 56.2500],
    "Bab-el-Mandeb": [12.6, 43.5],
    "Suez Canal": [30.4550, 32.3500],
    "Black Sea": [43.4000, 34.0000],
    "Baltic Sea": [58.0000, 20.0000],
    "East China Sea": [30.0000, 126.0000],
    "Sea of Azov": [46.0000, 36.5000],
    "Persian Gulf": [26.0000, 52.0000],
    # --- Palestine ---
    "Palestine": [31.9522, 35.2332],
    "Jerusalem": [31.7683, 35.2137],
    "Tel Aviv": [32.0853, 34.7818],
    "Nablus": [32.2211, 35.2544],
}
|
|
|
|
# Headline classifier table, scanned top to bottom: the first entry whose
# search term occurs in the lower-cased title wins, so narrower phrases are
# listed ahead of the generic words they contain.
# Row layout: (search_term, event_label, severity)
ATTACK_CLASSIFIER = [
    # Missiles and rockets
    ("ballistic missile",  "BALLISTIC MISSILE",  "CRITICAL"),
    ("cruise missile",     "CRUISE MISSILE",     "CRITICAL"),
    ("hypersonic missile", "HYPERSONIC MISSILE", "CRITICAL"),
    ("missile strike",     "MISSILE STRIKE",     "CRITICAL"),
    ("missile attack",     "MISSILE ATTACK",     "CRITICAL"),
    ("rocket attack",      "ROCKET ATTACK",      "CRITICAL"),
    ("rocket barrage",     "ROCKET BARRAGE",     "CRITICAL"),
    # Drones and air power
    ("drone strike",       "DRONE STRIKE",       "CRITICAL"),
    ("drone attack",       "DRONE ATTACK",       "CRITICAL"),
    ("airstrike",          "AIRSTRIKE",          "CRITICAL"),
    ("air strike",         "AIRSTRIKE",          "CRITICAL"),
    ("air raid",           "AIR RAID",           "CRITICAL"),
    # Bombs, artillery and explosions
    ("bombing",            "BOMBING",            "CRITICAL"),
    ("bomb",               "EXPLOSION",          "HIGH"),
    ("shelling",           "ARTILLERY SHELLING", "HIGH"),
    ("artillery",          "ARTILLERY FIRE",     "HIGH"),
    ("mortar",             "MORTAR ATTACK",      "HIGH"),
    ("ied explosion",      "IED EXPLOSION",      "HIGH"),
    ("explosion",          "EXPLOSION",          "HIGH"),
    # Small arms and naval
    ("sniper",             "SNIPER ACTIVITY",    "MODERATE"),
    ("naval attack",       "NAVAL ATTACK",       "HIGH"),
    ("warship",            "NAVAL ACTIVITY",     "MODERATE"),
    # Broad operations and generic fallbacks
    ("invasion",           "INVASION",           "CRITICAL"),
    ("offensive",          "MILITARY OFFENSIVE", "HIGH"),
    ("combat",             "COMBAT",             "HIGH"),
    ("attack",             "ATTACK",             "HIGH"),
    ("strike",             "STRIKE",             "HIGH"),
    ("troops",             "GROUND FORCES",      "MODERATE"),
    ("military",           "MILITARY ACTIVITY",  "MODERATE"),
]
|
|
|
|
# Sort rank per severity label; a lower rank sorts earlier.
SEVERITY_ORDER = {
    level: rank for rank, level in enumerate(("CRITICAL", "HIGH", "MODERATE"))
}
|
|
|
|
# Lower-case keywords marking off-topic headlines (sport, entertainment,
# lifestyle, civilian space launches). Titles that hit any of them are dropped.
EXCLUDE_KEYWORDS = [
    # sport
    "sport",
    "football",
    "soccer",
    "la liga",
    "cup",
    "olympics",
    "tennis",
    "nfl",
    "nba",
    "golf",
    "cricket",
    "rugby",
    # entertainment and lifestyle
    "celebrity",
    "fashion",
    "movie",
    "film",
    "oscars",
    "grammy",
    "recipe",
    "horoscope",
    "box office",
    "netflix",
    # civilian launches
    "rocket launch",
    "spacex",
    "starship",
    "nasa launch",
]
|
|
|
|
|
|
def classify_event(title: str) -> tuple[str, str]:
    """Map a headline to an ``(event_label, severity)`` pair.

    The title is lower-cased and checked against ATTACK_CLASSIFIER in table
    order; the first entry whose search term is contained in the title decides
    the result. Titles matching nothing get ("MILITARY EVENT", "MODERATE").
    """
    lowered = title.lower()
    first_hit = next(
        (
            (label, severity)
            for term, label, severity in ATTACK_CLASSIFIER
            if term in lowered
        ),
        None,
    )
    if first_hit is None:
        return "MILITARY EVENT", "MODERATE"
    return first_hit
|
|
|
|
|
|
def _title_similarity(a: str, b: str) -> float:
    """Return the case-insensitive SequenceMatcher ratio of two titles (0.0-1.0)."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()
|
|
|
|
|
|
def _parse_gdelt_date(seendate: str) -> str:
    """Convert a GDELT ``seendate`` stamp into an ISO-8601 UTC string.

    Args:
        seendate: Timestamp in the form ``YYYYMMDDTHHMMSSZ``.

    Returns:
        The same instant as a timezone-aware ISO-8601 string (``+00:00``).
        If the value is not a string in the expected format, the current UTC
        time is returned instead, so a bad stamp never aborts processing.
    """
    try:
        parsed = datetime.strptime(seendate, "%Y%m%dT%H%M%SZ")
    except (TypeError, ValueError):
        # strptime raises ValueError for a malformed string and TypeError for
        # a non-string; anything else would be a real bug and should surface.
        return datetime.now(timezone.utc).isoformat()
    # The trailing "Z" is matched as a literal, so the parsed value is naive;
    # attach UTC explicitly.
    return parsed.replace(tzinfo=timezone.utc).isoformat()
|
|
|
|
|
|
def _exclusion_pattern():
    """Compile EXCLUDE_KEYWORDS into one regex anchored at word starts, or None if empty."""
    if not EXCLUDE_KEYWORDS:
        return None
    alternatives = "|".join(re.escape(kw) for kw in EXCLUDE_KEYWORDS)
    # Only the leading edge is anchored so plurals/inflections ("sports",
    # "films") still match, while keywords buried inside unrelated words
    # ("nfl" in "conflict", "cup" in "occupation") no longer do.
    return re.compile(r"\b(?:" + alternatives + r")")


def _locate_region(title: str):
    """Return ``(lat, lon)`` for the first GEO_DATA region named in *title*, or None.

    Regions are tried in GEO_DATA order as whole words, case-insensitively.
    A hit that lies entirely inside a longer hit (e.g. "Taiwan" inside
    "Taiwan Strait") is skipped so the more specific region is used.
    """
    hits = []
    for region, coords in GEO_DATA.items():
        found = re.search(r"\b" + re.escape(region) + r"\b", title, re.IGNORECASE)
        if found:
            hits.append((found.span(), coords))

    for (start, end), coords in hits:
        inside_longer = any(
            (o_start, o_end) != (start, end) and o_start <= start and end <= o_end
            for (o_start, o_end), _ in hits
        )
        if not inside_longer:
            lat, lon = coords
            return lat, lon
    return None


async def fetch_conflicts() -> list:
    """Fetch military attack/conflict events from the GDELT DOC API.

    Requests GDELT_URL (up to three attempts, backing off on HTTP 429), then
    for each returned article: drops untitled and off-topic headlines, geocodes
    the title against GEO_DATA, skips near-duplicate titles (similarity above
    0.75), and classifies the attack type and severity.

    Returns:
        Up to 40 event dicts with keys ``title``, ``url``, ``image``,
        ``source``, ``domain``, ``lat``, ``lon``, ``published_at``,
        ``event_type``, ``severity`` and ``type``; ordered by severity
        (CRITICAL first) and then by ``published_at`` ascending. Returns an
        empty list on any HTTP, parsing or unexpected error; errors are
        printed, never raised.
    """
    try:
        async with httpx.AsyncClient() as client:
            # Retry with backoff for 429 rate limits (waits 6s, then 12s).
            for attempt in range(3):
                if attempt > 0:
                    await asyncio.sleep(6 * attempt)
                response = await client.get(GDELT_URL, timeout=12.0)
                if response.status_code == 429:
                    print(f"[CONFLICTS] GDELT rate limited, retry {attempt+1}/3")
                    continue
                break

        # Also reached with a 429 when every attempt was rate limited.
        if response.status_code != 200:
            print(f"[CONFLICTS] GDELT returned {response.status_code}")
            return []

        try:
            # An empty body is treated as "no articles" rather than an error.
            data = response.json() if response.text.strip() else {}
        except Exception:
            print("[CONFLICTS] GDELT returned non-JSON body")
            return []

        raw_articles = data.get("articles", [])
        if not raw_articles:
            print("[CONFLICTS] No articles in GDELT response")
            return []

        exclude_re = _exclusion_pattern()
        events = []

        for article in raw_articles:
            title = (article.get("title") or "").strip()
            if not title:
                continue

            # Filter out non-conflict content.
            if exclude_re is not None and exclude_re.search(title.lower()):
                continue

            # Geocode; articles that name no known region are dropped.
            located = _locate_region(title)
            if located is None:
                continue
            lat, lon = located
            if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
                continue

            # Deduplicate by title similarity against what was already kept.
            if any(_title_similarity(title, ev["title"]) > 0.75 for ev in events):
                continue

            event_label, severity = classify_event(title)
            seendate = article.get("seendate", "")
            if seendate:
                published = _parse_gdelt_date(seendate)
            else:
                published = datetime.now(timezone.utc).isoformat()

            events.append({
                "title": title,
                "url": article.get("url", ""),
                "image": article.get("urlMobileImage", ""),
                "source": article.get("domain", "Unknown"),
                "domain": article.get("sourcecountry", "Unknown"),
                "lat": lat,
                "lon": lon,
                "published_at": published,
                "event_type": event_label,
                "severity": severity,
                "type": "conflict",
            })

        # Sort: CRITICAL first, then by date (ascending ISO string).
        # NOTE(review): within a severity this puts the oldest items first, so
        # the 40-item cap below trims the newest ones — confirm that is intended.
        events.sort(key=lambda e: (
            SEVERITY_ORDER.get(e["severity"], 3),
            e["published_at"],
        ))

        events = events[:40]

        by_sev = {}
        for e in events:
            s = e["severity"]
            by_sev[s] = by_sev.get(s, 0) + 1

        print(f"[CONFLICTS] {len(events)} military events — {by_sev} (from {len(raw_articles)} raw)")
        return events

    except Exception as e:
        # Top-level boundary: callers get an empty list instead of an exception.
        print(f"[CONFLICTS] Fetch error: {e}")
        return []
|
|
|
|
|
|
# Manual smoke test: fetch once and print the first 15 events.
if __name__ == "__main__":
    fetched = asyncio.run(fetch_conflicts())
    print(f"\nMilitary events: {len(fetched)}")
    preview = fetched[:15]
    for item in preview:
        print(f" [{item['severity']:8s}] [{item['event_type']:22s}] {item['title'][:70]}")
|