Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 122 additions & 64 deletions solutions/02_web_scraping_solutions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -243,14 +243,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[WARN] No se pudo leer https://www.ilga.gov/Senate/List: 404 Client Error: Not Found for url: https://www.ilga.gov/Senate/List\n",
"Perfiles encontrados en la lista: 60\n",
"Total miembros parseados: 60\n",
"('Member', 8505, 'D', 'https://www.ilga.gov/Senate/Members/Details/3264')\n",
Expand All @@ -272,7 +271,7 @@
"3 Member 5966 D https://www.ilga.gov/Senate/Members/Details/3269\n",
"4 Member 422 D https://www.ilga.gov/Senate/Members/Details/3270\n",
"\n",
"CSV generado: senado_ilga_moderno.csv\n"
"CSV generado: senado_ilga_members.csv\n"
]
}
],
Expand All @@ -284,7 +283,7 @@
"\n",
"LIST_URLS = [\n",
" \"https://www.ilga.gov/Senate/Members/List\",\n",
" \"https://www.ilga.gov/Senate/List\",\n",
" \"https://www.ilga.gov/Senate/Members\",\n",
"]\n",
"BASE = \"https://www.ilga.gov\"\n",
"\n",
Expand Down Expand Up @@ -415,7 +414,7 @@
" print(\"\\nPrimeras 5 filas:\")\n",
" print(df.head())\n",
" df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n",
" print(\"\\nCSV generado: senado_ilga_moderno.csv\")\n",
" print(\"\\nCSV generado: senado_ilga_members.csv\")\n",
" except ImportError:\n",
" print(\"Pandas no está instalado; omitiendo CSV. Instala con: pip install pandas openpyxl\")\n",
"\n",
Expand Down Expand Up @@ -547,53 +546,107 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import re, time\n",
"from urllib.parse import urljoin\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"def get_members(url: str = \"https://www.ilga.gov/Senate/Members/List\"):\n",
"HEADERS = {\"User-Agent\": \"Mozilla/5.0\"}\n",
"\n",
"# --- util: extraer distrito/partido desde texto plano ---\n",
"def _extract_district_party(text: str):\n",
" # 1) \"47 R\"\n",
" m = re.search(r'\\b(\\d+)\\s+([DRI])\\b', text)\n",
" if m:\n",
" return int(m.group(1)), m.group(2).upper()\n",
" # 2) \"District 47 (R)\" u otras variantes\n",
" m = re.search(r'(?:District\\s*)?(\\d+).*?\\b([DRI])\\b', text, flags=re.I)\n",
" if m:\n",
" return int(m.group(1)), m.group(2).upper()\n",
" # 3) Si solo viene el nombre del partido completo\n",
" party = \"\"\n",
" if re.search(r'\\bDemocrat(ic)?\\b', text, flags=re.I):\n",
" party = \"D\"\n",
" elif re.search(r'\\bRepublican\\b', text, flags=re.I):\n",
" party = \"R\"\n",
" elif re.search(r'\\bIndependent\\b', text, flags=re.I):\n",
" party = \"I\"\n",
" return None, party # distrito desconocido, partido si se detectó\n",
"\n",
"# --- leer distrito/partido desde el perfil individual ---\n",
"def _parse_profile(profile_url: str, session: requests.Session):\n",
" r = session.get(profile_url, timeout=30)\n",
" r.raise_for_status()\n",
" psoup = BeautifulSoup(r.text, \"lxml\")\n",
"\n",
" # nombre (por si quieres validar)\n",
" name = \"\"\n",
" h1 = psoup.select_one(\"h1\")\n",
" if h1:\n",
" name = h1.get_text(strip=True)\n",
" elif psoup.title:\n",
" name = psoup.title.get_text(strip=True)\n",
"\n",
" text = psoup.get_text(\" \", strip=True)\n",
" district, party = _extract_district_party(text)\n",
" return name, district, party\n",
"\n",
"def get_members(url: str = \"https://www.ilga.gov/Senate/Members\"):\n",
" \"\"\"\n",
" Devuelve una lista de tuplas (Nombre, Distrito:int, Partido:str, Perfil:str)\n",
" extraídas desde la página moderna del Senado de Illinois.\n",
" Devuelve lista de tuplas (Nombre, Distrito:int|None, Partido:str, Perfil:str)\n",
" Tomando enlaces a /Senate/Members/Details/... desde /Senate/Members o /Senate/Members/List.\n",
" Si el distrito/partido no está cerca del enlace, se visita el perfil para extraerlos.\n",
" \"\"\"\n",
" headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
" resp = requests.get(url, headers=headers, timeout=30)\n",
" resp.raise_for_status()\n",
" s = requests.Session()\n",
" s.headers.update(HEADERS)\n",
"\n",
" r = s.get(url, timeout=30)\n",
" r.raise_for_status()\n",
" soup = BeautifulSoup(r.text, \"lxml\")\n",
"\n",
" soup = BeautifulSoup(resp.text, \"lxml\")\n",
" members = []\n",
" seen = set()\n",
"\n",
" # Enlaces a perfiles /Senate/Members/Details/<id>\n",
" # Enlaces a perfiles\n",
" for a in soup.select('a[href*=\"/Senate/Members/Details/\"], a[href*=\"/senate/members/details/\"]'):\n",
" name = a.get_text(strip=True)\n",
" if not name:\n",
" continue\n",
"\n",
" # En el contenedor suele venir \"Nombre 47 R\" (número + partido)\n",
" full_text = a.parent.get_text(\" \", strip=True)\n",
" tail = full_text.replace(name, \"\").strip()\n",
"\n",
" # 1) patrón directo: \"47 R\"\n",
" m = re.search(r'(\\d+)\\s+([DRI])\\b', tail)\n",
" # 2) fallback: \"District 47 (R)\" u otras variantes\n",
" if not m:\n",
" m = re.search(r'(?:District\\s*)?(\\d+).*?([DRI])\\b', tail, re.I)\n",
" if not m:\n",
" # si no se detecta distrito/partido, igual guarda el nombre y el perfil\n",
" district, party = None, \"\"\n",
" else:\n",
" district = int(m.group(1))\n",
" party = m.group(2).upper()\n",
"\n",
" profile = urljoin(url, a.get(\"href\"))\n",
" profile = urljoin(url, a.get(\"href\") or \"\")\n",
" if profile in seen:\n",
" continue\n",
" seen.add(profile)\n",
"\n",
" # 1) Intentar extraer distrito/partido del contenedor más cercano\n",
" container = a.parent\n",
" # sube hasta 3 niveles si es necesario (algunas páginas usan divs anidados)\n",
" hops = 0\n",
" while container and hops < 3 and len(container.get_text(strip=True)) < 10:\n",
" container = container.parent\n",
" hops += 1\n",
"\n",
" tail_text = \"\"\n",
" if container:\n",
" # texto del contenedor sin el nombre, para evitar falsos positivos\n",
" ctext = container.get_text(\" \", strip=True)\n",
" tail_text = ctext.replace(name, \"\").strip()\n",
"\n",
" district, party = _extract_district_party(tail_text)\n",
"\n",
" # 2) Si no encontramos, entramos al perfil\n",
" if district is None and not party:\n",
" _, district, party = _parse_profile(profile, s)\n",
" time.sleep(0.2) # cortesía con el servidor\n",
"\n",
" members.append((name, district, party, profile))\n",
"\n",
" return members\n"
" return members\n",
"\n"
]
},
{
Expand Down Expand Up @@ -644,27 +697,28 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 97,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total miembros: 60\n",
"('Neil Anderson', None, '', 'https://www.ilga.gov/Senate/Members/Details/3312')\n",
"('Omar Aquino', None, '', 'https://www.ilga.gov/Senate/Members/Details/3316')\n",
"('Li Arellano, Jr.', None, '', 'https://www.ilga.gov/Senate/Members/Details/3383')\n",
"('Chris Balkema', None, '', 'https://www.ilga.gov/Senate/Members/Details/3413')\n",
"('Christopher Belt', None, '', 'https://www.ilga.gov/Senate/Members/Details/3337')\n"
"('Neil Anderson', 2006, 'R', 'https://www.ilga.gov/Senate/Members/Details/3312')\n",
"('Omar Aquino', 2016, 'D', 'https://www.ilga.gov/Senate/Members/Details/3316')\n",
"('Li Arellano, Jr.', 2025, 'D', 'https://www.ilga.gov/Senate/Members/Details/3383')\n",
"('Chris Balkema', 2025, 'R', 'https://www.ilga.gov/Senate/Members/Details/3413')\n",
"('Christopher Belt', 2019, 'R', 'https://www.ilga.gov/Senate/Members/Details/3337')\n"
]
}
],
"source": [
"senate_members = get_members() # o get_members(\"https://www.ilga.gov/Senate/Members/List\")\n",
"print(\"Total miembros:\", len(senate_members))\n",
"for m in senate_members[:5]:\n",
" print(m)\n"
" print(m)\n",
"\n"
]
},
{
Expand All @@ -679,19 +733,19 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Nombre Distrito Partido \\\n",
"0 Neil Anderson None \n",
"1 Omar Aquino None \n",
"2 Li Arellano, Jr. None \n",
"3 Chris Balkema None \n",
"4 Christopher Belt None \n",
" Nombre Distrito Partido \\\n",
"0 Neil Anderson 2006.0 R \n",
"1 Omar Aquino 2016.0 D \n",
"2 Li Arellano, Jr. 2025.0 D \n",
"3 Chris Balkema 2025.0 R \n",
"4 Christopher Belt 2019.0 R \n",
"\n",
" Perfil \n",
"0 https://www.ilga.gov/Senate/Members/Details/3312 \n",
Expand All @@ -707,7 +761,8 @@
"\n",
"df = pd.DataFrame(senate_members, columns=[\"Nombre\", \"Distrito\", \"Partido\", \"Perfil\"])\n",
"print(df.head())\n",
"# df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n"
"\n",
"df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n"
]
},
{
Expand Down Expand Up @@ -742,7 +797,7 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -867,7 +922,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -879,15 +934,18 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Clave encontrada: None\n",
"No hay datos para distrito 52.\n"
"Clave encontrada: 2006\n",
"Clave encontrada: 2016\n",
"Clave encontrada: 2025\n",
"Clave encontrada: 2019\n",
"Número de proyectos para distrito 2006: 0\n"
]
}
],
Expand All @@ -897,36 +955,36 @@
" print(\"Clave encontrada:\", key)\n",
"\n",
"# Acceder de forma segura\n",
"if 52 in bills_dict:\n",
" print(\"Número de proyectos para distrito 52:\", len(bills_dict[52]))\n",
"if 2006 in bills_dict:\n",
" print(\"Número de proyectos para distrito 2006:\", len(bills_dict[2006]))\n",
"else:\n",
" print(\"No hay datos para distrito 52.\")\n"
" print(\"No hay datos para distrito 2006.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 107,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Claves disponibles en bills_dict: [None]\n",
"Total claves: 1\n",
"No hay datos para el distrito 52.\n"
"Claves disponibles en bills_dict: [2006, 2016, 2025, 2019]\n",
"Total claves: 4\n",
"No hay datos para el distrito 2019.\n"
]
}
],
"source": [
"print(\"Claves disponibles en bills_dict:\", list(bills_dict.keys()))\n",
"print(\"Total claves:\", len(bills_dict))\n",
"\n",
"bills_52 = bills_dict.get(52) or bills_dict.get(\"52\")\n",
"if bills_52 is None:\n",
" print(\"No hay datos para el distrito 52.\")\n",
"bills_2019 = bills_dict.get(2019) or bills_dict.get(\"2019\")\n",
"if bills_2019 is None:\n",
" print(\"No hay datos para el distrito 2019.\")\n",
"else:\n",
" print(\"Número de proyectos:\", len(bills_52))\n"
" print(\"Número de proyectos:\", len(bills_2019))\n"
]
}
],
Expand Down
Loading