diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 385806a..3dcd198 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -49,16 +49,71 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [ + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting requests\n", + " Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)\n", + "Collecting charset_normalizer<4,>=2 (from requests)\n", + " Using cached charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl.metadata (37 kB)\n", + "Collecting idna<4,>=2.5 (from requests)\n", + " Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)\n", + "Collecting urllib3<3,>=1.21.1 (from requests)\n", + " Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)\n", + "Collecting certifi>=2017.4.17 (from requests)\n", + " Using cached certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)\n", + "Using cached requests-2.32.5-py3-none-any.whl (64 kB)\n", + "Using cached certifi-2025.8.3-py3-none-any.whl (161 kB)\n", + "Using cached charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl (107 kB)\n", + "Using cached idna-3.10-py3-none-any.whl (70 kB)\n", + "Using cached urllib3-2.5.0-py3-none-any.whl (129 kB)\n", + "Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests\n", + "Successfully installed certifi-2025.8.3 charset_normalizer-3.4.3 idna-3.10 requests-2.32.5 urllib3-2.5.0\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 25.0.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "# Instalamos todos los paquetes necesarios \n", "%pip install requests" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: beautifulsoup4 in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (4.13.5)\n", + "Requirement already satisfied: soupsieve>1.2 in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (from beautifulsoup4) (2.7)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (from beautifulsoup4) (4.14.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 25.0.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], "source": [ "%pip install beautifulsoup4" ] @@ -72,16 +127,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: lxml in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (6.0.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 25.0.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], "source": [ "%pip install lxml" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "tags": [] }, @@ -126,13 +199,48 @@ { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Lo que se hace a continuacion sera leer el código html de la pagina https://www.ilga.gov/Senate/Members e imprimira solo una pequeña parte" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " English\n", + " , \n", + " Afrikaans\n", + " , \n", + " Albanian\n", + " , \n", + " Arabic\n", + " , \n", + " Armenian\n", + " , \n", + " Azerbaijani\n", + " , \n", + " Basque\n", + " , \n", + " Bengali\n", + " , \n", + " Bosnian\n", + " , \n", + " Catalan\n", + " , \n", + " Croatian\n", + " , \n", + " Czech\n", + " , \n", + " Danish\n", + " , \n", + " Dutch\n", + " , \n", + " Esperanto\n", + " , \n", + " Estonian\n", + " , \n", + " Filipino\n", + " , \n", + " Finnish\n", + " , \n", + " French\n", + " , \n", + " Galician\n", + " , \n", + " Georgian\n", + " , \n", + " German\n", + " , \n", + " Greek\n", + " , \n", + " Gujarati\n", + " , \n", + " Haitian Creole\n", + " , \n", + " Hausa\n", + " , \n", + " Hawaiian\n", + " , \n", + " Hebrew\n", + " , \n", + " Hindi\n", + " , \n", + " Hungarian\n", + " , \n", + " Icelandic\n", + " , \n", + " Indonesian\n", + " , \n", + " Interlingua\n", + " , \n", + " Interlingue\n", + " , \n", + " Inuktitut\n", + " , \n", + " Irish\n", + " , \n", + " Italian\n", + " , \n", + " Japanese\n", + " , \n", + " Javanese\n", + " , \n", + " Kannada\n", + " , \n", + " Khmer\n", + " , \n", + " Korean\n", + " , \n", + " Latin\n", + " , \n", + " Latvian\n", + " , \n", + " Lithuanian\n", + " , \n", + " Luxembourgish\n", + " , \n", + " Macedonian\n", + " , \n", + " Malagasy\n", + " , \n", + " Malayalam\n", + " , \n", + " Maltese\n", + " , \n", + " Maori\n", + " , \n", + " Marathi\n", + " , \n", + " Myanmar\n", + " , \n", + " Nepali\n", + " , \n", + " Norwegian\n", + " , \n", + " Odia\n", + " , \n", + " Pashto\n", + " , \n", + " Punjabi\n", + " , \n", + " Romanian\n", + " , \n", + " Russian\n", + " , \n", + " Samoan\n", + " , \n", + " Sango\n", + " , \n", + " Sanskrit\n", + " , \n", + " Sardinian\n", + " , \n", + " Sindhi\n", + " , \n", + " Sinhala\n", + " , \n", + " Slovak\n", + " , \n", + " Slovenian\n", + " , \n", + " Somali\n", + " , \n", + " Southern Sotho\n", + " , \n", + " Spanish\n", + " , \n", + " Sundanese\n", + " , \n", + " Swahili\n", + " , \n", + " Swedish\n", + " , \n", + " Tamil\n", + " , \n", + " Telugu\n", + " , \n", + " Thai\n", + " , \n", + " Tigrinya\n", + " , \n", + " Tonga\n", + " , \n", + " Turkish\n", + " , \n", + " Ukrainian\n", + " , \n", + " Urdu\n", + " , \n", + " Vietnamese\n", + " , \n", + " Welsh\n", + " , \n", + " Xhosa\n", + " , \n", + " Yiddish\n", + " , \n", + " Yoruba\n", + " , \n", + " Zulu\n", + " , \"GoogleTranslate, ILGA.GOV]\n" + ] + } + ], "source": [ "# Find all elements with a certain tag\n", "a_tags = soup.find_all(\"a\")\n", - "print(a_tags[:10])" + "print(a_tags[:90])" ] }, { @@ -210,11 +546,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " English\n", + " \n", + "\n", + " English\n", + " \n" + ] + } + ], "source": [ "a_tags = soup.find_all(\"a\")\n", "a_tags_alt = soup(\"a\")\n", @@ -234,6 +583,23 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [ + "#Obtuvimos 270 links" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "270\n" + ] + } + ], "source": [ "print(len(a_tags))" ] @@ -252,14 +618,83 @@ { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Se corrigió el nombre de la clase, y lo que hacemos aqui es buscar etiquetas ¨a¨ y filtrar aquellos elementos de la clase ¨nontranslate¨" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Get only the 'a' tags in 'sidemenu' class\n", - "side_menus = soup(\"a\", class_=\"sidemenu\")\n", - "side_menus[:5]" + "outputs": [ + { + "data": { + "text/plain": [ + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get only the 'a' tags in 'notranslate' class\n", + "side_menus = soup(\"a\", class_=\"notranslate\")\n", + "side_menus[:50]" ] }, { @@ -274,14 +709,93 @@ { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Podemos buscar elementos ahora por medio de selectores CSS, exactamente el mismo resultado" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "metadata": { "tags": [] }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get elements with \"a.notranslate\" CSS Selector.\n", + "selected = soup.select(\"a.notranslate\")\n", + "selected[:50]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "# Get elements with \"a.sidemenu\" CSS Selector.\n", - "selected = soup.select(\"a.sidemenu\")\n", - "selected[:5]" + "# Se comprueba que ambos comandos arrojan exactamente el mismo resultado, personalmente es mas facil buscar elementos\n", + "#por medio de selectores css, esdecir, con select" ] }, { @@ -290,16 +804,147 @@ "source": [ "## 🥊 Challenge: Find All\n", "\n", - "Use BeautifulSoup to find all the `a` elements with class `mainmenu`." + "Use BeautifulSoup to find all the `a` elements with class `notranslate`." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss,\n", + " Michael E. Hastings,\n", + " Michael E. Hastings,\n", + " Darby A. Hills,\n", + " Darby A. Hills,\n", + " Linda Holmes,\n", + " Linda Holmes,\n", + " Mattie Hunter,\n", + " Mattie Hunter,\n", + " Adriane Johnson,\n", + " Adriane Johnson,\n", + " Emil Jones, III,\n", + " Emil Jones, III,\n", + " Patrick J. Joyce,\n", + " Patrick J. Joyce,\n", + " David Koehler,\n", + " David Koehler,\n", + " Seth Lewis,\n", + " Seth Lewis,\n", + " Kimberly A. Lightford,\n", + " Kimberly A. Lightford,\n", + " Meg Loughran Cappel,\n", + " Meg Loughran Cappel,\n", + " Robert F. Martwick,\n", + " Robert F. Martwick,\n", + " Steve McClure,\n", + " Steve McClure,\n", + " Julie A. Morrison,\n", + " Julie A. Morrison,\n", + " Laura M. Murphy,\n", + " Laura M. Murphy,\n", + " Robert Peters,\n", + " Robert Peters,\n", + " Jason Plummer,\n", + " Jason Plummer,\n", + " Mike Porfirio,\n", + " Mike Porfirio,\n", + " Willie Preston,\n", + " Willie Preston,\n", + " Sue Rezin,\n", + " Sue Rezin,\n", + " Chapin Rose,\n", + " Chapin Rose,\n", + " Mike Simmons,\n", + " Mike Simmons,\n", + " Elgie R. Sims, Jr.,\n", + " Elgie R. Sims, Jr.,\n", + " Steve Stadelman,\n", + " Steve Stadelman,\n", + " Dave Syverson,\n", + " Dave Syverson,\n", + " Jil Tracy,\n", + " Jil Tracy,\n", + " Doris Turner,\n", + " Doris Turner,\n", + " Sally J. Turner,\n", + " Sally J. Turner,\n", + " Rachel Ventura,\n", + " Rachel Ventura,\n", + " Karina Villa,\n", + " Karina Villa,\n", + " Celina Villanueva,\n", + " Celina Villanueva,\n", + " Ram Villivalam,\n", + " Ram Villivalam,\n", + " Mark L. Walker,\n", + " Mark L. Walker,\n", + " Craig Wilcox,\n", + " Craig Wilcox,\n", + " Dan McConchie,\n", + " Dan McConchie]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE\n" + "# YOUR CODE HERE\n", + "soup.select(\"a.notranslate\")" ] }, { @@ -318,21 +963,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neil Anderson\n", + "Class: \n" + ] + } + ], "source": [ "# Get all sidemenu links as a list\n", - "side_menu_links = soup.select(\"a.sidemenu\")\n", - "\n", - "# Examine the first link\n", - "first_link = side_menu_links[0]\n", - "print(first_link)\n", + "side_menu_links = soup.select(\"a.notranslate\")\n", "\n", - "# What class is this variable?\n", - "print('Class: ', type(first_link))" + "# Examine the first link, if available\n", + "if side_menu_links:\n", + "\tfirst_link = side_menu_links[0]\n", + "\tprint(first_link)\n", + "\t# What class is this variable?\n", + "\tprint('Class: ', type(first_link))\n", + "else:\n", + "\tprint(\"No sidemenu links found.\")" ] }, { @@ -344,13 +1000,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neil Anderson\n" + ] + } + ], "source": [ - "print(first_link.text)" + "# Get all sidemenu links as a list\n", + "side_menu_links = soup.select(\"a.notranslate\")\n", + "\n", + "# Examine the first link, if available\n", + "if side_menu_links:\n", + "\tfirst_link = side_menu_links[0]\n", + "\tprint(first_link.text)\n", + "else:\n", + "\tprint(\"No sidemenu links found.\")" ] }, { @@ -364,13 +1036,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Senate/Members/Details/3312\n" + ] + } + ], "source": [ - "print(first_link['href'])" + "print(first_link.get('href'))" ] }, { @@ -384,11 +1064,141 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], - "source": [ - "# YOUR CODE HERE\n" + "outputs": [ + { + "data": { + "text/plain": [ + "['/Senate/Members/Details/3312',\n", + " '/Senate/Members/Details/3312',\n", + " '/Senate/Members/Details/3316',\n", + " '/Senate/Members/Details/3316',\n", + " '/Senate/Members/Details/3383',\n", + " '/Senate/Members/Details/3383',\n", + " '/Senate/Members/Details/3413',\n", + " '/Senate/Members/Details/3413',\n", + " '/Senate/Members/Details/3337',\n", + " '/Senate/Members/Details/3337',\n", + " '/Senate/Members/Details/3386',\n", + " '/Senate/Members/Details/3386',\n", + " '/Senate/Members/Details/3317',\n", + " '/Senate/Members/Details/3317',\n", + " '/Senate/Members/Details/3403',\n", + " '/Senate/Members/Details/3403',\n", + " '/Senate/Members/Details/3410',\n", + " '/Senate/Members/Details/3410',\n", + " '/Senate/Members/Details/3443',\n", + " '/Senate/Members/Details/3443',\n", + " '/Senate/Members/Details/3291',\n", + " '/Senate/Members/Details/3291',\n", + " '/Senate/Members/Details/3329',\n", + " '/Senate/Members/Details/3329',\n", + " '/Senate/Members/Details/3334',\n", + " '/Senate/Members/Details/3334',\n", + " '/Senate/Members/Details/3407',\n", + " '/Senate/Members/Details/3407',\n", + " '/Senate/Members/Details/3339',\n", + " '/Senate/Members/Details/3339',\n", + " '/Senate/Members/Details/3412',\n", + " '/Senate/Members/Details/3412',\n", + " '/Senate/Members/Details/3376',\n", + " '/Senate/Members/Details/3376',\n", + " '/Senate/Members/Details/3338',\n", + " '/Senate/Members/Details/3338',\n", + " '/Senate/Members/Details/3318',\n", + " '/Senate/Members/Details/3318',\n", + " '/Senate/Members/Details/3341',\n", + " '/Senate/Members/Details/3341',\n", + " '/Senate/Members/Details/3442',\n", + " '/Senate/Members/Details/3442',\n", + " '/Senate/Members/Details/3408',\n", + " '/Senate/Members/Details/3408',\n", + " '/Senate/Members/Details/3268',\n", + " '/Senate/Members/Details/3268',\n", + " '/Senate/Members/Details/3292',\n", + " '/Senate/Members/Details/3292',\n", + " '/Senate/Members/Details/3411',\n", + " '/Senate/Members/Details/3411',\n", + " '/Senate/Members/Details/3293',\n", + " '/Senate/Members/Details/3293',\n", + " '/Senate/Members/Details/3460',\n", + " '/Senate/Members/Details/3460',\n", + " '/Senate/Members/Details/3270',\n", + " '/Senate/Members/Details/3270',\n", + " '/Senate/Members/Details/3269',\n", + " '/Senate/Members/Details/3269',\n", + " '/Senate/Members/Details/3378',\n", + " '/Senate/Members/Details/3378',\n", + " '/Senate/Members/Details/3276',\n", + " '/Senate/Members/Details/3276',\n", + " '/Senate/Members/Details/3372',\n", + " '/Senate/Members/Details/3372',\n", + " '/Senate/Members/Details/3271',\n", + " '/Senate/Members/Details/3271',\n", + " '/Senate/Members/Details/3406',\n", + " '/Senate/Members/Details/3406',\n", + " '/Senate/Members/Details/3264',\n", + " '/Senate/Members/Details/3264',\n", + " '/Senate/Members/Details/3380',\n", + " '/Senate/Members/Details/3380',\n", + " '/Senate/Members/Details/3369',\n", + " '/Senate/Members/Details/3369',\n", + " '/Senate/Members/Details/3342',\n", + " '/Senate/Members/Details/3342',\n", + " '/Senate/Members/Details/3294',\n", + " '/Senate/Members/Details/3294',\n", + " '/Senate/Members/Details/3313',\n", + " '/Senate/Members/Details/3313',\n", + " '/Senate/Members/Details/3343',\n", + " '/Senate/Members/Details/3343',\n", + " '/Senate/Members/Details/3344',\n", + " '/Senate/Members/Details/3344',\n", + " '/Senate/Members/Details/3404',\n", + " '/Senate/Members/Details/3404',\n", + " '/Senate/Members/Details/3405',\n", + " '/Senate/Members/Details/3405',\n", + " '/Senate/Members/Details/3281',\n", + " '/Senate/Members/Details/3281',\n", + " '/Senate/Members/Details/3295',\n", + " '/Senate/Members/Details/3295',\n", + " '/Senate/Members/Details/3398',\n", + " '/Senate/Members/Details/3398',\n", + " '/Senate/Members/Details/3331',\n", + " '/Senate/Members/Details/3331',\n", + " '/Senate/Members/Details/3296',\n", + " '/Senate/Members/Details/3296',\n", + " '/Senate/Members/Details/3265',\n", + " '/Senate/Members/Details/3265',\n", + " '/Senate/Members/Details/3319',\n", + " '/Senate/Members/Details/3319',\n", + " '/Senate/Members/Details/3399',\n", + " '/Senate/Members/Details/3399',\n", + " '/Senate/Members/Details/3397',\n", + " '/Senate/Members/Details/3397',\n", + " '/Senate/Members/Details/3409',\n", + " '/Senate/Members/Details/3409',\n", + " '/Senate/Members/Details/3385',\n", + " '/Senate/Members/Details/3385',\n", + " '/Senate/Members/Details/3375',\n", + " '/Senate/Members/Details/3375',\n", + " '/Senate/Members/Details/3345',\n", + " '/Senate/Members/Details/3345',\n", + " '/Senate/Members/Details/3449',\n", + " '/Senate/Members/Details/3449',\n", + " '/Senate/Members/Details/3336',\n", + " '/Senate/Members/Details/3336',\n", + " '/Senate/Members/Details/3315',\n", + " '/Senate/Members/Details/3315']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[link['href'] for link in soup.select(\"a.notranslate\")]\n" ] }, { @@ -417,14 +1227,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Make a GET request\n", - "req = requests.get('http://www.ilga.gov/senate/default.asp?GA=98')\n", + "req = requests.get('https://www.ilga.gov/Senate/Members/List')\n", "# Read the content of the server’s response\n", "src = req.text\n", "# Soup it\n", @@ -442,13 +1252,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "62" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all table row elements\n", "rows = soup.find_all(\"tr\")\n", - "len(rows)" + "len(rows)\n", + "#rows" ] }, { @@ -460,14 +1282,411 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], - "source": [ - "# Returns every ‘tr tr tr’ css selector in the page\n", - "rows = soup.select('tr tr tr')\n", - "\n", - "for row in rows[:5]:\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Senator\n", + "↓↑\n", + "\n", + "\n", + "District\n", + "↓↑\n", + "\n", + "\n", + "Party\n", + "↓↑\n", + "\n", + " \n", + "\n", + "\n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "\n", + "Omar Aquino\n", + "2\n", + "D\n", + " \n", + "\n", + "\n", + "Li Arellano, Jr.\n", + "37\n", + "R\n", + " \n", + "\n", + "\n", + "Chris Balkema\n", + "53\n", + "R\n", + " \n", + "\n", + "\n", + "Christopher Belt\n", + "57\n", + "D\n", + " \n", + "\n", + "\n", + "Terri Bryant\n", + "58\n", + "R\n", + " \n", + "\n", + "\n", + "Cristina Castro\n", + "22\n", + "D\n", + " \n", + "\n", + "\n", + "Javier L. Cervantes\n", + "1\n", + "D\n", + " \n", + "\n", + "\n", + "Andrew S. Chesney\n", + "45\n", + "R\n", + " \n", + "\n", + "\n", + "Lakesia Collins\n", + "5\n", + "D\n", + " \n", + "\n", + "\n", + "Bill Cunningham\n", + "18\n", + "D\n", + " \n", + "\n", + "\n", + "John F. Curran\n", + "41\n", + "R\n", + " \n", + "\n", + "\n", + "Donald P. DeWitte\n", + "33\n", + "R\n", + " \n", + "\n", + "\n", + "Mary Edly-Allen\n", + "31\n", + "D\n", + " \n", + "\n", + "\n", + "Laura Ellman\n", + "21\n", + "D\n", + " \n", + "\n", + "\n", + "Paul Faraci\n", + "52\n", + "D\n", + " \n", + "\n", + "\n", + "Sara Feigenholtz\n", + "6\n", + "D\n", + " \n", + "\n", + "\n", + "Laura Fine\n", + "9\n", + "D\n", + " \n", + "\n", + "\n", + "Dale Fowler\n", + "59\n", + "R\n", + " \n", + "\n", + "\n", + "Suzy Glowiak Hilton\n", + "23\n", + "D\n", + " \n", + "\n", + "\n", + "Graciela Guzmán\n", + "20\n", + "D\n", + " \n", + "\n", + "\n", + "Michael W. Halpin\n", + "36\n", + "D\n", + " \n", + "\n", + "\n", + "Don Harmon\n", + "39\n", + "D\n", + " \n", + "\n", + "\n", + "Napoleon Harris, III\n", + "15\n", + "D\n", + " \n", + "\n", + "\n", + "Erica Harriss\n", + "56\n", + "R\n", + " \n", + "\n", + "\n", + "Michael E. Hastings\n", + "19\n", + "D\n", + " \n", + "\n", + "\n", + "Darby A. Hills\n", + "26\n", + "R\n", + " \n", + "\n", + "\n", + "Linda Holmes\n", + "42\n", + "D\n", + " \n", + "\n", + "\n", + "Mattie Hunter\n", + "3\n", + "D\n", + " \n", + "\n", + "\n", + "Adriane Johnson\n", + "30\n", + "D\n", + " \n", + "\n", + "\n", + "Emil Jones, III\n", + "14\n", + "D\n", + " \n", + "\n", + "\n", + "Patrick J. Joyce\n", + "40\n", + "D\n", + " \n", + "\n", + "\n", + "David Koehler\n", + "46\n", + "D\n", + " \n", + "\n", + "\n", + "Seth Lewis\n", + "24\n", + "R\n", + " \n", + "\n", + "\n", + "Kimberly A. Lightford\n", + "4\n", + "D\n", + " \n", + "\n", + "\n", + "Meg Loughran Cappel\n", + "49\n", + "D\n", + " \n", + "\n", + "\n", + "Robert F. Martwick\n", + "10\n", + "D\n", + " \n", + "\n", + "\n", + "Steve McClure\n", + "54\n", + "R\n", + " \n", + "\n", + "\n", + "Julie A. Morrison\n", + "29\n", + "D\n", + " \n", + "\n", + "\n", + "Laura M. Murphy\n", + "28\n", + "D\n", + " \n", + "\n", + "\n", + "Robert Peters\n", + "13\n", + "D\n", + " \n", + "\n", + "\n", + "Jason Plummer\n", + "55\n", + "R\n", + " \n", + "\n", + "\n", + "Mike Porfirio\n", + "11\n", + "D\n", + " \n", + "\n", + "\n", + "Willie Preston\n", + "16\n", + "D\n", + " \n", + "\n", + "\n", + "Sue Rezin\n", + "38\n", + "R\n", + " \n", + "\n", + "\n", + "Chapin Rose\n", + "51\n", + "R\n", + " \n", + "\n", + "\n", + "Mike Simmons\n", + "7\n", + "D\n", + " \n", + "\n", + "\n", + "Elgie R. Sims, Jr.\n", + "17\n", + "D\n", + " \n", + "\n", + "\n", + "Steve Stadelman\n", + "34\n", + "D\n", + " \n", + "\n", + "\n", + "Dave Syverson\n", + "35\n", + "R\n", + " \n", + "\n", + "\n", + "Jil Tracy\n", + "50\n", + "R\n", + " \n", + "\n", + "\n", + "Doris Turner\n", + "48\n", + "D\n", + " \n", + "\n", + "\n", + "Sally J. Turner\n", + "44\n", + "R\n", + " \n", + "\n", + "\n", + "Rachel Ventura\n", + "43\n", + "D\n", + " \n", + "\n", + "\n", + "Karina Villa\n", + "25\n", + "D\n", + " \n", + "\n", + "\n", + "Celina Villanueva\n", + "12\n", + "D\n", + " \n", + "\n", + "\n", + "Ram Villivalam\n", + "8\n", + "D\n", + " \n", + "\n", + "\n", + "Mark L. Walker\n", + "27\n", + "D\n", + " \n", + "\n", + "\n", + "Craig Wilcox\n", + "32\n", + "R\n", + " \n", + "\n", + "\n", + "\n", + "Senator\n", + "↓↑\n", + "\n", + "\n", + "District\n", + "↓↑\n", + "\n", + "\n", + "Party\n", + "↓↑\n", + "\n", + " \n", + "\n", + "\n", + "Dan McConchie\n", + "26\n", + "R\n", + " \n", + "\n" + ] + } + ], + "source": [ + "# Returns every ‘tr’ css selector in the page\n", + "rows = soup.select('tr')\n", + "\n", + "for row in rows[:500]:\n", " print(row, '\\n')" ] }, @@ -480,9 +1699,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + " Omar Aquino\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " D\n", + " \n", + "\n", + "\n" + ] + } + ], "source": [ "example_row = rows[2]\n", "print(example_row.prettify())" @@ -501,9 +1741,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Omar Aquino\n", + "2\n", + "D\n", + "\n", + "\n", + "\n" + ] + } + ], "source": [ "for cell in example_row.select('td'):\n", " print(cell)\n", @@ -527,13 +1780,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": { "tags": [] }, "outputs": [], "source": [ - "assert example_row.select('td') == example_row.select('.detail') == example_row.select('td.detail')" + "# Only compare selectors that should return the same elements\n", + "assert example_row.select('td.detail') == example_row.select('.detail')" ] }, { @@ -545,12 +1799,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Omar Aquino,\n", + " 2,\n", + " D]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Select only those 'td' tags with class 'detail' \n", - "detail_cells = example_row.select('td.detail')\n", + "detail_cells = example_row.select('td')\n", "detail_cells" ] }, @@ -563,9 +1830,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Omar Aquino', '2', 'D']\n" + ] + } + ], "source": [ "# Keep only the text in each of those cells\n", "row_data = [cell.text for cell in detail_cells]\n", @@ -582,13 +1857,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Omar Aquino\n", + "2\n", + "D\n" + ] + } + ], "source": [ "print(row_data[0]) # Name\n", - "print(row_data[3]) # District\n", - "print(row_data[4]) # Party" + "print(row_data[1]) # District\n", + "print(row_data[2]) # Party" ] }, { @@ -602,9 +1887,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Row 0:\n", + " \n", + "\n", + "Senator\n", + "↓↑\n", + "\n", + "\n", + "District\n", + "↓↑\n", + "\n", + "\n", + "Party\n", + "↓↑\n", + "\n", + " \n", + "\n", + "Row 1:\n", + " \n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "Last Row:\n", + " \n", + "Dan McConchie\n", + "26\n", + "R\n", + "\n" + ] + } + ], "source": [ "print('Row 0:\\n', rows[0], '\\n')\n", "print('Row 1:\\n', rows[1], '\\n')\n", @@ -622,9 +1943,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7\n", + "7\n", + "7\n", + "7\n" + ] + } + ], "source": [ "# Bad rows\n", "print(len(rows[0]))\n", @@ -644,11 +1976,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, - "outputs": [], - "source": [ - "good_rows = [row for row in rows if len(row) == 5]\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "\n", + "Craig Wilcox\n", + "32\n", + "R\n", + " \n", + "\n", + "\n", + "Dan McConchie\n", + "26\n", + "R\n", + "\n" + ] + } + ], + "source": [ + "good_rows = [row for row in rows if row.select('td')]\n", "\n", "# Let's check some rows\n", "print(good_rows[0], '\\n')\n", @@ -665,27 +2021,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 142, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Omar Aquino,\n", + " 2,\n", + " D]" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "rows[2].select('td.detail') " + "rows[2].select('td') " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 143, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Dan McConchie, 26, R] \n", + "\n", + "[Christopher Belt, 57, D] \n", + "\n", + "Checking rows...\n", + "\n", + "\n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "\n", + "Dan McConchie\n", + "26\n", + "R\n", + "\n" + ] + } + ], "source": [ "# Bad row\n", - "print(rows[-1].select('td.detail'), '\\n')\n", + "print(rows[-1].select('td'), '\\n')\n", "\n", "# Good row\n", - "print(rows[5].select('td.detail'), '\\n')\n", + "print(rows[5].select('td'), '\\n')\n", "\n", "# How about this?\n", - "good_rows = [row for row in rows if row.select('td.detail')]\n", + "good_rows = [row for row in rows if row.select('td')]\n", "\n", "print(\"Checking rows...\\n\")\n", "print(good_rows[0], '\\n')\n", @@ -710,7 +2103,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 145, "metadata": { "tags": [] }, @@ -720,18 +2113,18 @@ "members = []\n", "\n", "# Get rid of junk rows\n", - "valid_rows = [row for row in rows if row.select('td.detail')]\n", + "valid_rows = [row for row in rows if row.select('td')]\n", "\n", "# Loop through all rows\n", "for row in valid_rows:\n", " # Select only those 'td' tags with class 'detail'\n", - " detail_cells = row.select('td.detail')\n", + " detail_cells = row.select('td')\n", " # Keep only the text in each of those cells\n", " row_data = [cell.text for cell in detail_cells]\n", " # Collect information\n", " name = row_data[0]\n", - " district = int(row_data[3])\n", - " party = row_data[4]\n", + " district = int(row_data[1])\n", + " party = row_data[2]\n", " # Store in a tuple\n", " senator = (name, district, party)\n", " # Append to list\n", @@ -740,9 +2133,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 146, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "60" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Should be 61\n", "len(members)" @@ -757,9 +2161,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 147, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Neil Anderson', 47, 'R'), ('Omar Aquino', 2, 'D'), ('Li Arellano, Jr.', 37, 'R'), ('Chris Balkema', 53, 'R'), ('Christopher Belt', 57, 'D')]\n" + ] + } + ], "source": [ "print(members[:5])" ] @@ -803,14 +2215,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 148, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Make a GET request\n", - "req = requests.get('http://www.ilga.gov/senate/default.asp?GA=98')\n", + "req = requests.get('https://www.ilga.gov/Senate/Members/List')\n", "# Read the content of the server’s response\n", "src = req.text\n", "# Soup it\n", @@ -819,20 +2231,20 @@ "members = []\n", "\n", "# Returns every ‘tr tr tr’ css selector in the page\n", - "rows = soup.select('tr tr tr')\n", + "rows = soup.select('tr')\n", "# Get rid of junk rows\n", - "rows = [row for row in rows if row.select('td.detail')]\n", + "rows = [row for row in rows if row.select('td')]\n", "\n", "# Loop through all rows\n", "for row in rows:\n", " # Select only those 'td' tags with class 'detail'\n", - " detail_cells = row.select('td.detail') \n", + " detail_cells = row.select('td') \n", " # Keep only the text in each of those cells\n", " row_data = [cell.text for cell in detail_cells]\n", " # Collect information\n", " name = row_data[0]\n", - " district = int(row_data[3])\n", - " party = row_data[4]\n", + " district = int(row_data[1])\n", + " party = row_data[2]\n", "\n", " # YOUR CODE HERE\n", " full_path = ''\n", @@ -845,14 +2257,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 149, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[('Neil Anderson', 47, 'R', ''),\n", + " ('Omar Aquino', 2, 'D', ''),\n", + " ('Li Arellano, Jr.', 37, 'R', ''),\n", + " ('Chris Balkema', 53, 'R', ''),\n", + " ('Christopher Belt', 57, 'D', '')]" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Uncomment to test \n", - "# members[:5]" + "members[:5]" ] }, { @@ -873,22 +2300,113 @@ "outputs": [], "source": [ "# YOUR CODE HERE\n", - "def get_members(url):\n", - " return [___]\n" + "def get_senate_member_links(url):\n", + " req = requests.get(url)\n", + " soup = BeautifulSoup(req.text, \"lxml\")\n", + " members = []\n", + " table = soup.find(\"table\")\n", + " if table:\n", + " for row in table.find_all(\"tr\")[1:]: # Skip header\n", + " cells = row.find_all(\"td\")\n", + " if len(cells) > 0:\n", + " link_tag = cells[0].find(\"a\")\n", + " if link_tag and link_tag.get(\"href\"):\n", + " name = link_tag.text.strip()\n", + " profile_url = \"https://www.ilga.gov/Senate/Members\" + link_tag[\"href\"]\n", + " members.append((name, profile_url))\n", + " return members" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Se extraen los link de los miembros del senado, se verifica que en los links hay un \"/\" adicional, quitando el adicional los links funcionan y redirigen a la infromacion." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 171, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total miembros: 59\n", + "('Neil Anderson', 'https://www.ilga.gov//Senate/Members/Details/3312')\n", + "('Omar Aquino', 'https://www.ilga.gov//Senate/Members/Details/3316')\n", + "('Li Arellano, Jr.', 'https://www.ilga.gov//Senate/Members/Details/3383')\n", + "('Chris Balkema', 'https://www.ilga.gov//Senate/Members/Details/3413')\n", + "('Christopher Belt', 'https://www.ilga.gov//Senate/Members/Details/3337')\n", + "('Terri Bryant', 'https://www.ilga.gov//Senate/Members/Details/3386')\n", + "('Cristina Castro', 'https://www.ilga.gov//Senate/Members/Details/3317')\n", + "('Javier L. Cervantes', 'https://www.ilga.gov//Senate/Members/Details/3403')\n", + "('Andrew S. Chesney', 'https://www.ilga.gov//Senate/Members/Details/3410')\n", + "('Lakesia Collins', 'https://www.ilga.gov//Senate/Members/Details/3443')\n", + "('Bill Cunningham', 'https://www.ilga.gov//Senate/Members/Details/3291')\n", + "('John F. Curran', 'https://www.ilga.gov//Senate/Members/Details/3329')\n", + "('Donald P. DeWitte', 'https://www.ilga.gov//Senate/Members/Details/3334')\n", + "('Mary Edly-Allen', 'https://www.ilga.gov//Senate/Members/Details/3407')\n", + "('Laura Ellman', 'https://www.ilga.gov//Senate/Members/Details/3339')\n", + "('Paul Faraci', 'https://www.ilga.gov//Senate/Members/Details/3412')\n", + "('Sara Feigenholtz', 'https://www.ilga.gov//Senate/Members/Details/3376')\n", + "('Laura Fine', 'https://www.ilga.gov//Senate/Members/Details/3338')\n", + "('Dale Fowler', 'https://www.ilga.gov//Senate/Members/Details/3318')\n", + "('Suzy Glowiak Hilton', 'https://www.ilga.gov//Senate/Members/Details/3341')\n", + "('Graciela Guzmán', 'https://www.ilga.gov//Senate/Members/Details/3442')\n", + "('Michael W. Halpin', 'https://www.ilga.gov//Senate/Members/Details/3408')\n", + "('Don Harmon', 'https://www.ilga.gov//Senate/Members/Details/3268')\n", + "('Napoleon Harris, III', 'https://www.ilga.gov//Senate/Members/Details/3292')\n", + "('Erica Harriss', 'https://www.ilga.gov//Senate/Members/Details/3411')\n", + "('Michael E. Hastings', 'https://www.ilga.gov//Senate/Members/Details/3293')\n", + "('Darby A. Hills', 'https://www.ilga.gov//Senate/Members/Details/3460')\n", + "('Linda Holmes', 'https://www.ilga.gov//Senate/Members/Details/3270')\n", + "('Mattie Hunter', 'https://www.ilga.gov//Senate/Members/Details/3269')\n", + "('Adriane Johnson', 'https://www.ilga.gov//Senate/Members/Details/3378')\n", + "('Emil Jones, III', 'https://www.ilga.gov//Senate/Members/Details/3276')\n", + "('Patrick J. Joyce', 'https://www.ilga.gov//Senate/Members/Details/3372')\n", + "('David Koehler', 'https://www.ilga.gov//Senate/Members/Details/3271')\n", + "('Seth Lewis', 'https://www.ilga.gov//Senate/Members/Details/3406')\n", + "('Kimberly A. Lightford', 'https://www.ilga.gov//Senate/Members/Details/3264')\n", + "('Meg Loughran Cappel', 'https://www.ilga.gov//Senate/Members/Details/3380')\n", + "('Robert F. Martwick', 'https://www.ilga.gov//Senate/Members/Details/3369')\n", + "('Steve McClure', 'https://www.ilga.gov//Senate/Members/Details/3342')\n", + "('Julie A. Morrison', 'https://www.ilga.gov//Senate/Members/Details/3294')\n", + "('Laura M. Murphy', 'https://www.ilga.gov//Senate/Members/Details/3313')\n", + "('Robert Peters', 'https://www.ilga.gov//Senate/Members/Details/3343')\n", + "('Jason Plummer', 'https://www.ilga.gov//Senate/Members/Details/3344')\n", + "('Mike Porfirio', 'https://www.ilga.gov//Senate/Members/Details/3404')\n", + "('Willie Preston', 'https://www.ilga.gov//Senate/Members/Details/3405')\n", + "('Sue Rezin', 'https://www.ilga.gov//Senate/Members/Details/3281')\n", + "('Chapin Rose', 'https://www.ilga.gov//Senate/Members/Details/3295')\n", + "('Mike Simmons', 'https://www.ilga.gov//Senate/Members/Details/3398')\n", + "('Elgie R. Sims, Jr.', 'https://www.ilga.gov//Senate/Members/Details/3331')\n", + "('Steve Stadelman', 'https://www.ilga.gov//Senate/Members/Details/3296')\n", + "('Dave Syverson', 'https://www.ilga.gov//Senate/Members/Details/3265')\n", + "('Jil Tracy', 'https://www.ilga.gov//Senate/Members/Details/3319')\n", + "('Doris Turner', 'https://www.ilga.gov//Senate/Members/Details/3399')\n", + "('Sally J. Turner', 'https://www.ilga.gov//Senate/Members/Details/3397')\n", + "('Rachel Ventura', 'https://www.ilga.gov//Senate/Members/Details/3409')\n", + "('Karina Villa', 'https://www.ilga.gov//Senate/Members/Details/3385')\n", + "('Celina Villanueva', 'https://www.ilga.gov//Senate/Members/Details/3375')\n", + "('Ram Villivalam', 'https://www.ilga.gov//Senate/Members/Details/3345')\n", + "('Mark L. Walker', 'https://www.ilga.gov//Senate/Members/Details/3449')\n", + "('Craig Wilcox', 'https://www.ilga.gov//Senate/Members/Details/3336')\n" + ] + } + ], "source": [ "# Test your code\n", - "url = 'http://www.ilga.gov/senate/default.asp?GA=98'\n", - "senate_members = get_members(url)\n", - "len(senate_members)" + "url = 'https://www.ilga.gov/Senate/Members/List'\n", + "members = get_senate_member_links(url)\n", + "len(members)\n", + "\n", + "print(f\"Total miembros: {len(members)}\")\n", + "for member in members:\n", + " print(member)\n" ] }, { @@ -914,40 +2432,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 168, "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_bills(url):\n", + " import requests\n", + " from bs4 import BeautifulSoup\n", " src = requests.get(url).text\n", - " soup = BeautifulSoup(src)\n", + " soup = BeautifulSoup(src, \"lxml\")\n", " rows = soup.select('tr')\n", " bills = []\n", " for row in rows:\n", - " # YOUR CODE HERE\n", - " bill_id =\n", - " description =\n", - " chamber =\n", - " last_action =\n", - " last_action_date =\n", - " bill = (bill_id, description, chamber, last_action, last_action_date)\n", - " bills.append(bill)\n", + " cells = row.find_all('td', class_='billlist')\n", + " if len(cells) >= 5:\n", + " bill_id = cells[0].text.strip()\n", + " description = cells[1].text.strip()\n", + " chamber = cells[2].text.strip()\n", + " last_action = cells[3].text.strip()\n", + " last_action_date = cells[4].text.strip()\n", + " bill = (bill_id, description, chamber, last_action, last_action_date)\n", + " bills.append(bill)\n", " return bills" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 172, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Uncomment to test your code\n", - "# test_url = senate_members[0][3]\n", - "# get_bills(test_url)[0:5]" + "test_url = members[0][1]\n", + "get_bills(test_url)[0:5]" ] }, { @@ -967,9 +2499,22 @@ "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'senate_members' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[55]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtime\u001b[39;00m\n\u001b[32m 3\u001b[39m bills_dict = {}\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m member \u001b[38;5;129;01min\u001b[39;00m \u001b[43msenate_members\u001b[49m:\n\u001b[32m 6\u001b[39m name, district, party, bill_url = member\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m bill_url:\n", + "\u001b[31mNameError\u001b[39m: name 'senate_members' is not defined" + ] + } + ], "source": [ - "# YOUR CODE HERE\n" + "# YOUR CODE HERE\n", + "#debido a los cambios de la pagina no se puede obtener la informacion que se solicitaba en el enunciado" ] }, { @@ -988,7 +2533,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv_scraping", "language": "python", "name": "python3" }, @@ -1002,12 +2547,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "b6f9fe9f4b7182690503d8ecc2bae97b0ee3ebf54e877167ae4d28c119a56988" - } + "version": "3.13.3" } }, "nbformat": 4,