From b50777ec2f087662acfff73cbae0977bde462442 Mon Sep 17 00:00:00 2001 From: robertomalave Date: Sat, 23 Aug 2025 17:36:45 -0500 Subject: [PATCH 1/8] archivo con cambios en el codigo funcioando --- lessons/02_web_scraping.ipynb | 2311 ++++++++++++++++++++++++++++++--- 1 file changed, 2163 insertions(+), 148 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 385806a..8be777e 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -47,18 +47,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: requests in c:\\users\\roberto\\anaconda3\\lib\\site-packages (2.32.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (2025.4.26)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install requests" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: beautifulsoup4 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (4.12.3)\n", + "Requirement already satisfied: soupsieve>1.2 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from beautifulsoup4) (2.5)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install beautifulsoup4" ] @@ -72,16 +95,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: lxml in c:\\users\\roberto\\anaconda3\\lib\\site-packages (5.3.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install lxml" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "tags": [] }, @@ -125,18 +157,439 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " Illinois General Assembly - Members\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
Select Language
\n", + " \n", + "
\n", + "
\n", + "

The Illinois General Assembly offers the Google Translate™ service for visitor convenience. In no way should it be considered accurate as to the translation of any content herein.

\n", + "

Visitors of the Illinois General Assembly website are encouraged to use other translation services available on the internet.

\n", + "

The English language version is always the official and authoritative version of this website.

\n", + "

NOTE: To return to the original English language version, select the \"Show Original\" button on the Google Translate™ menu bar at the top of the window.

\n", + "
\n", + " \n", + "
\n", + " \n", + " English\n", + " \n", + " \n", + " Afrikaans\n", + " \n", + " \n", + " Albanian\n", + " \n", + " \n", + " Arabic\n", + " \n", + " \n", + " Armenian\n", + " \n", + " \n", + " Azerbaijani\n", + " \n", + " \n", + " Basque\n", + " \n", + " \n", + " Bengali\n", + " \n", + " \n", + " Bosnian\n", + " \n", + " \n", + " Catalan\n", + " \n", + " \n", + " Croatian\n", + " \n", + " \n", + " Czech\n", + " \n", + " \n", + " Danish\n", + " \n", + " \n", + " Dutch\n", + " \n", + " \n", + " Esperanto\n", + " \n", + " \n", + " Estonian\n", + " \n", + " \n", + " Filipino\n", + " \n", + " \n", + " Finnish\n", + " \n", + " \n", + " French\n", + " \n", + " \n", + " Galician\n", + " \n", + " \n", + " Georgian\n", + " \n", + " \n", + " German\n", + " \n", + " \n", + " Greek\n", + " \n", + " \n", + " Gujarati\n", + " \n", + " \n", + " Haitian Creole\n", + " \n", + " \n", + " Hausa\n", + " \n", + " \n", + " Hawaiian\n", + " \n", + " \n", + " Hebrew\n", + " \n", + " \n", + " Hindi\n", + " \n", + " \n", + " Hungarian\n", + " \n", + " \n", + " Icelandic\n", + " \n", + " \n", + " Indonesian\n", + " \n", + " \n", + " Interlingua\n", + " \n", + " \n", + " Interlingue\n", + " \n", + " \n", + " Inuktitut\n", + " \n", + " \n", + " Irish\n", + " \n", + " \n", + " Italian\n", + " \n", + " \n", + " Japanese\n", + " \n", + " \n", + " Javanese\n", + " \n", + " \n", + " Kannada\n", + " \n", + " \n", + " Khmer\n", + " \n", + " \n", + " Korean\n", + " \n", + " \n", + " Latin\n", + " \n", + " \n", + " Latvian\n", + " \n", + " \n", + " Lithuanian\n", + " \n", + " \n", + " Luxembourgish\n", + " \n", + " \n", + " Macedonian\n", + " \n", + " \n", + " Malagasy\n", + " \n", + " \n", + " Malayalam\n", + " \n", + " \n", + " Maltese\n", + " \n", + " \n", + " Maori\n", + " \n", + " \n", + " Marathi\n", + " \n", + " \n", + " Myanmar\n", + " \n", + " \n", + " Nepali\n", + " \n", + " \n", + " Norwegian\n", + " \n", + " \n", + " Odia\n", + " \n", + " \n", + " Pashto\n", + " \n", + " \n", + " Punjabi\n", + " \n", + " \n", + " Romanian\n", + " \n", + " \n", + " Russian\n", + " \n", + " \n", + " Samoan\n", + " \n", + " \n", + " Sango\n", + " \n", + " \n", + " Sanskrit\n", + " \n", + " \n", + " Sardinian\n", + " \n", + " \n", + " Sindhi\n", + " \n", + " \n", + " Sinhala\n", + " \n", + " \n", + " Slovak\n", + " \n", + " \n", + " Slovenian\n", + " \n", + " \n", + " Somali\n", + " \n", + " \n", + " Southern Sotho\n", + " \n", + " \n", + " Spanish\n", + " \n", + " \n", + " Sundanese\n", + " \n", + " \n", + " Swahili\n", + " \n", + " \n", + " Swedish\n", + " \n", + " \n", + " Tamil\n", + " \n", + " \n", + " Telugu\n", + " \n", + " \n", + " Thai\n", + " \n", + " \n", + " Tigrinya\n", + " \n", + " \n", + " Tonga\n", + " \n", + " \n", + " Turkish\n", + " \n", + " \n", + " Ukrainian\n", + " \n", + " \n", + " Urdu\n", + " \n", + " \n", + " Vietnamese\n", + " \n", + " \n", + " Welsh\n", + " \n", + " \n", + " Xhosa\n", + " \n", + " \n", + " Yiddish\n", + " \n", + " \n", + " Yoruba\n", + " \n", + " \n", + " Zulu\n", + " \n", + "
\n", + "
\n", + " Powered by \"GoogleTranslate\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
ILGA.gov Virtual Assistant
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "

I'm your ILGA.gov Virtual Assistant. Ask me about legislation, bill status, and more.

\n" + ] + } + ], "source": [ "# Make a GET request\n", - "req = requests.get('http://www.ilga.gov/senate/default.asp')\n", + "req = requests.get('https://www.ilga.gov/Senate/Members')\n", "# Read the content of the server’s response\n", "src = req.text\n", "# View some output\n", - "print(src[:1000])" + "print(src[:30000])" ] }, { @@ -152,14 +605,219 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " Illinois General Assembly - Members\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " Select Language\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " The Illinois General Assembly offers the Google Translate™ service for visitor convenience. In no way should it be considered accurate as to the translation of any content herein.\n", + "

\n", + "

\n", + " Visitors of the Illinois General Assembly website are encouraged to use other translation services available on the internet.\n", + "

\n", + "

\n", + " The English language version is always the official and authoritative version of this website.\n", + "

\n", + "

\n", + " NOTE: To return to the original English language version, select the \"Show Original\" button on the Google Translate™ menu bar at the top of the window.\n", + "

\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " English\n", + " \n", + " \n", + " \n", + " \n", + " Afrikaans\n", + " \n", + " \n", + " \n", + " \n", + " Albanian\n", + " \n", + " \n", + " \n", + " \n", + " Arabic\n", + " \n", + " \n", + " \n", + " \n", + " Armenian\n", + " \n", + " \n", + " \n", + " \n", + " Azerbaijani\n", + " \n", + " \n", + " \n", + " \n", + " Basque\n", + " \n", + " \n", + " \n", + " \n", + " Bengali\n", + " \n", + " \n", + " English\n", + " , \n", + " Afrikaans\n", + " , \n", + " Albanian\n", + " , \n", + " Arabic\n", + " , \n", + " Armenian\n", + " , \n", + " Azerbaijani\n", + " , \n", + " Basque\n", + " , \n", + " Bengali\n", + " , \n", + " Bosnian\n", + " , \n", + " Catalan\n", + " , \n", + " Croatian\n", + " , \n", + " Czech\n", + " , \n", + " Danish\n", + " , \n", + " Dutch\n", + " , \n", + " Esperanto\n", + " , \n", + " Estonian\n", + " , \n", + " Filipino\n", + " , \n", + " Finnish\n", + " , \n", + " French\n", + " , \n", + " Galician\n", + " , \n", + " Georgian\n", + " , \n", + " German\n", + " , \n", + " Greek\n", + " , \n", + " Gujarati\n", + " , \n", + " Haitian Creole\n", + " , \n", + " Hausa\n", + " , \n", + " Hawaiian\n", + " , \n", + " Hebrew\n", + " , \n", + " Hindi\n", + " , \n", + " Hungarian\n", + " , \n", + " Icelandic\n", + " , \n", + " Indonesian\n", + " , \n", + " Interlingua\n", + " , \n", + " Interlingue\n", + " , \n", + " Inuktitut\n", + " , \n", + " Irish\n", + " , \n", + " Italian\n", + " , \n", + " Japanese\n", + " , \n", + " Javanese\n", + " , \n", + " Kannada\n", + " , \n", + " Khmer\n", + " , \n", + " Korean\n", + " , \n", + " Latin\n", + " , \n", + " Latvian\n", + " , \n", + " Lithuanian\n", + " , \n", + " Luxembourgish\n", + " , \n", + " Macedonian\n", + " , \n", + " Malagasy\n", + " , \n", + " Malayalam\n", + " , \n", + " Maltese\n", + " , \n", + " Maori\n", + " , \n", + " Marathi\n", + " , \n", + " Myanmar\n", + " , \n", + " Nepali\n", + " , \n", + " Norwegian\n", + " , \n", + " Odia\n", + " , \n", + " Pashto\n", + " , \n", + " Punjabi\n", + " , \n", + " Romanian\n", + " , \n", + " Russian\n", + " , \n", + " Samoan\n", + " , \n", + " Sango\n", + " , \n", + " Sanskrit\n", + " , \n", + " Sardinian\n", + " , \n", + " Sindhi\n", + " , \n", + " Sinhala\n", + " , \n", + " Slovak\n", + " , \n", + " Slovenian\n", + " , \n", + " Somali\n", + " , \n", + " Southern Sotho\n", + " , \n", + " Spanish\n", + " , \n", + " Sundanese\n", + " , \n", + " Swahili\n", + " , \n", + " Swedish\n", + " , \n", + " Tamil\n", + " , \n", + " Telugu\n", + " , \n", + " Thai\n", + " , \n", + " Tigrinya\n", + " , \n", + " Tonga\n", + " , \n", + " Turkish\n", + " , \n", + " Ukrainian\n", + " , \n", + " Urdu\n", + " , \n", + " Vietnamese\n", + " , \n", + " Welsh\n", + " , \n", + " Xhosa\n", + " , \n", + " Yiddish\n", + " , \n", + " Yoruba\n", + " , \n", + " Zulu\n", + " , \"GoogleTranslate, ILGA.GOV, \n", + "LEGISLATION & LAWS \n", + ", Bills & Resolutions, Public Acts, Illinois Compiled Statutes, Illinois Constitution, Search Legislation, Glossary, Guide, \n", + "Reports & Inquiry \n", + ", Legislative Reports, Special Reports, FTP Site, Legislator Lookup, Capitol Complex Phone Numbers, \n", + "Rules & Regulations \n", + ", Illinois Register, Administrative Rules, \n", + "Senate \n", + ", Members, Schedules, Committees,  Request for Remote Testimony, Journals, Transcripts, Rules, Audio/Video, FOIA Information, Senate Employment Opportunities, Media Guidelines, \n", + "House \n", + ", Members, Schedules, Committees,  Submit testimony for House Committees, Journals, Transcripts, Rules, Audio/Video, FOIA Information, House Employment Opportunities,  Log In, Home, View List, Officers, Leadership, Seating Chart, Report List, Neil Anderson, Neil Anderson, Omar Aquino, Omar Aquino, Li Arellano, Jr., Li Arellano, Jr., Chris Balkema, Chris Balkema, Christopher Belt, Christopher Belt, Terri Bryant, Terri Bryant, Cristina Castro, Cristina Castro, Javier L. Cervantes, Javier L. Cervantes, Andrew S. Chesney, Andrew S. Chesney, Lakesia Collins, Lakesia Collins, Bill Cunningham, Bill Cunningham, John F. Curran, John F. Curran, Donald P. DeWitte, Donald P. DeWitte, Mary Edly-Allen, Mary Edly-Allen, Laura Ellman, Laura Ellman, Paul Faraci, Paul Faraci, Sara Feigenholtz, Sara Feigenholtz, Laura Fine, Laura Fine, Dale Fowler, Dale Fowler, Suzy Glowiak Hilton, Suzy Glowiak Hilton, Graciela Guzmán, Graciela Guzmán, Michael W. Halpin, Michael W. Halpin, Don Harmon, Don Harmon, Napoleon Harris, III, Napoleon Harris, III, Erica Harriss, Erica Harriss, Michael E. Hastings, Michael E. Hastings, Darby A. Hills, Darby A. Hills, Linda Holmes, Linda Holmes, Mattie Hunter, Mattie Hunter, Adriane Johnson, Adriane Johnson, Emil Jones, III, Emil Jones, III, Patrick J. Joyce, Patrick J. Joyce, David Koehler, David Koehler, Seth Lewis, Seth Lewis, Kimberly A. Lightford, Kimberly A. Lightford, Meg Loughran Cappel, Meg Loughran Cappel, Robert F. Martwick, Robert F. Martwick, Steve McClure, Steve McClure, Julie A. Morrison, Julie A. Morrison, Laura M. Murphy, Laura M. Murphy, Robert Peters, Robert Peters, Jason Plummer, Jason Plummer, Mike Porfirio, Mike Porfirio, Willie Preston, Willie Preston, Sue Rezin, Sue Rezin, Chapin Rose, Chapin Rose, Mike Simmons, Mike Simmons, Elgie R. Sims, Jr., Elgie R. Sims, Jr., Steve Stadelman, Steve Stadelman, Dave Syverson, Dave Syverson, Jil Tracy, Jil Tracy, Doris Turner, Doris Turner, Sally J. Turner, Sally J. Turner, Rachel Ventura, Rachel Ventura, Karina Villa, Karina Villa, Celina Villanueva, Celina Villanueva, Ram Villivalam, Ram Villivalam, Mark L. Walker, Mark L. Walker, Craig Wilcox, Craig Wilcox, Dan McConchie, Dan McConchie, \n", + " Contact ILGA Webmaster\n", + " , \n", + "\n", + ", \n", + "\n", + ", ILGA.GOV, Disclaimers, \n", + " ADA\n", + " , \n", + " Contact ILGA Webmaster\n", + " , \n", + "\n", + ", \n", + "\n", + ", ILGA.GOV, Disclaimers, \n", + " ADA\n", + " , ]\n" + ] + } + ], "source": [ "# Find all elements with a certain tag\n", "a_tags = soup.find_all(\"a\")\n", - "print(a_tags[:10])" + "print(a_tags[:10000])" ] }, { @@ -210,11 +1078,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " English\n", + " \n", + "\n", + " English\n", + " \n" + ] + } + ], "source": [ "a_tags = soup.find_all(\"a\")\n", "a_tags_alt = soup(\"a\")\n", @@ -231,9 +1112,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "270\n" + ] + } + ], "source": [ "print(len(a_tags))" ] @@ -251,15 +1140,75 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Get only the 'a' tags in 'sidemenu' class\n", - "side_menus = soup(\"a\", class_=\"sidemenu\")\n", - "side_menus[:5]" + "outputs": [ + { + "data": { + "text/plain": [ + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get only the 'a' tags in 'notranslate' class\n", + "side_menus = soup(\"a\", class_=\"notranslate\")\n", + "side_menus[:50]" ] }, { @@ -273,15 +1222,75 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Get elements with \"a.sidemenu\" CSS Selector.\n", - "selected = soup.select(\"a.sidemenu\")\n", - "selected[:5]" + "outputs": [ + { + "data": { + "text/plain": [ + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get elements with \"a.notranslate\" CSS Selector.\n", + "selected = soup.select(\"a.notranslate\")\n", + "selected[:50]" ] }, { @@ -290,16 +1299,147 @@ "source": [ "## 🥊 Challenge: Find All\n", "\n", - "Use BeautifulSoup to find all the `a` elements with class `mainmenu`." + "Use BeautifulSoup to find all the `a` elements with class `notranslate`." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss,\n", + " Michael E. Hastings,\n", + " Michael E. Hastings,\n", + " Darby A. Hills,\n", + " Darby A. Hills,\n", + " Linda Holmes,\n", + " Linda Holmes,\n", + " Mattie Hunter,\n", + " Mattie Hunter,\n", + " Adriane Johnson,\n", + " Adriane Johnson,\n", + " Emil Jones, III,\n", + " Emil Jones, III,\n", + " Patrick J. Joyce,\n", + " Patrick J. Joyce,\n", + " David Koehler,\n", + " David Koehler,\n", + " Seth Lewis,\n", + " Seth Lewis,\n", + " Kimberly A. Lightford,\n", + " Kimberly A. Lightford,\n", + " Meg Loughran Cappel,\n", + " Meg Loughran Cappel,\n", + " Robert F. Martwick,\n", + " Robert F. Martwick,\n", + " Steve McClure,\n", + " Steve McClure,\n", + " Julie A. Morrison,\n", + " Julie A. Morrison,\n", + " Laura M. Murphy,\n", + " Laura M. Murphy,\n", + " Robert Peters,\n", + " Robert Peters,\n", + " Jason Plummer,\n", + " Jason Plummer,\n", + " Mike Porfirio,\n", + " Mike Porfirio,\n", + " Willie Preston,\n", + " Willie Preston,\n", + " Sue Rezin,\n", + " Sue Rezin,\n", + " Chapin Rose,\n", + " Chapin Rose,\n", + " Mike Simmons,\n", + " Mike Simmons,\n", + " Elgie R. Sims, Jr.,\n", + " Elgie R. Sims, Jr.,\n", + " Steve Stadelman,\n", + " Steve Stadelman,\n", + " Dave Syverson,\n", + " Dave Syverson,\n", + " Jil Tracy,\n", + " Jil Tracy,\n", + " Doris Turner,\n", + " Doris Turner,\n", + " Sally J. Turner,\n", + " Sally J. Turner,\n", + " Rachel Ventura,\n", + " Rachel Ventura,\n", + " Karina Villa,\n", + " Karina Villa,\n", + " Celina Villanueva,\n", + " Celina Villanueva,\n", + " Ram Villivalam,\n", + " Ram Villivalam,\n", + " Mark L. Walker,\n", + " Mark L. Walker,\n", + " Craig Wilcox,\n", + " Craig Wilcox,\n", + " Dan McConchie,\n", + " Dan McConchie]" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE\n" + "# YOUR CODE HERE\n", + "soup.select(\"a.notranslate\")" ] }, { @@ -318,21 +1458,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neil Anderson\n", + "Class: \n" + ] + } + ], "source": [ "# Get all sidemenu links as a list\n", - "side_menu_links = soup.select(\"a.sidemenu\")\n", + "side_menu_links = soup.select(\"a.notranslate\")\n", "\n", - "# Examine the first link\n", - "first_link = side_menu_links[0]\n", - "print(first_link)\n", - "\n", - "# What class is this variable?\n", - "print('Class: ', type(first_link))" + "# Examine the first link, if available\n", + "if side_menu_links:\n", + "\tfirst_link = side_menu_links[0]\n", + "\tprint(first_link)\n", + "\t# What class is this variable?\n", + "\tprint('Class: ', type(first_link))\n", + "else:\n", + "\tprint(\"No sidemenu links found.\")" ] }, { @@ -344,13 +1495,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neil Anderson\n" + ] + } + ], "source": [ - "print(first_link.text)" + "# Get all sidemenu links as a list\n", + "side_menu_links = soup.select(\"a.notranslate\")\n", + "\n", + "# Examine the first link, if available\n", + "if side_menu_links:\n", + "\tfirst_link = side_menu_links[0]\n", + "\tprint(first_link.text)\n", + "else:\n", + "\tprint(\"No sidemenu links found.\")" ] }, { @@ -364,13 +1531,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Senate/Members/Details/3312\n" + ] + } + ], "source": [ - "print(first_link['href'])" + "print(first_link.get('href'))" ] }, { @@ -384,11 +1559,141 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, - "outputs": [], - "source": [ - "# YOUR CODE HERE\n" + "outputs": [ + { + "data": { + "text/plain": [ + "['/Senate/Members/Details/3312',\n", + " '/Senate/Members/Details/3312',\n", + " '/Senate/Members/Details/3316',\n", + " '/Senate/Members/Details/3316',\n", + " '/Senate/Members/Details/3383',\n", + " '/Senate/Members/Details/3383',\n", + " '/Senate/Members/Details/3413',\n", + " '/Senate/Members/Details/3413',\n", + " '/Senate/Members/Details/3337',\n", + " '/Senate/Members/Details/3337',\n", + " '/Senate/Members/Details/3386',\n", + " '/Senate/Members/Details/3386',\n", + " '/Senate/Members/Details/3317',\n", + " '/Senate/Members/Details/3317',\n", + " '/Senate/Members/Details/3403',\n", + " '/Senate/Members/Details/3403',\n", + " '/Senate/Members/Details/3410',\n", + " '/Senate/Members/Details/3410',\n", + " '/Senate/Members/Details/3443',\n", + " '/Senate/Members/Details/3443',\n", + " '/Senate/Members/Details/3291',\n", + " '/Senate/Members/Details/3291',\n", + " '/Senate/Members/Details/3329',\n", + " '/Senate/Members/Details/3329',\n", + " '/Senate/Members/Details/3334',\n", + " '/Senate/Members/Details/3334',\n", + " '/Senate/Members/Details/3407',\n", + " '/Senate/Members/Details/3407',\n", + " '/Senate/Members/Details/3339',\n", + " '/Senate/Members/Details/3339',\n", + " '/Senate/Members/Details/3412',\n", + " '/Senate/Members/Details/3412',\n", + " '/Senate/Members/Details/3376',\n", + " '/Senate/Members/Details/3376',\n", + " '/Senate/Members/Details/3338',\n", + " '/Senate/Members/Details/3338',\n", + " '/Senate/Members/Details/3318',\n", + " '/Senate/Members/Details/3318',\n", + " '/Senate/Members/Details/3341',\n", + " '/Senate/Members/Details/3341',\n", + " '/Senate/Members/Details/3442',\n", + " '/Senate/Members/Details/3442',\n", + " '/Senate/Members/Details/3408',\n", + " '/Senate/Members/Details/3408',\n", + " '/Senate/Members/Details/3268',\n", + " '/Senate/Members/Details/3268',\n", + " '/Senate/Members/Details/3292',\n", + " '/Senate/Members/Details/3292',\n", + " '/Senate/Members/Details/3411',\n", + " '/Senate/Members/Details/3411',\n", + " '/Senate/Members/Details/3293',\n", + " '/Senate/Members/Details/3293',\n", + " '/Senate/Members/Details/3460',\n", + " '/Senate/Members/Details/3460',\n", + " '/Senate/Members/Details/3270',\n", + " '/Senate/Members/Details/3270',\n", + " '/Senate/Members/Details/3269',\n", + " '/Senate/Members/Details/3269',\n", + " '/Senate/Members/Details/3378',\n", + " '/Senate/Members/Details/3378',\n", + " '/Senate/Members/Details/3276',\n", + " '/Senate/Members/Details/3276',\n", + " '/Senate/Members/Details/3372',\n", + " '/Senate/Members/Details/3372',\n", + " '/Senate/Members/Details/3271',\n", + " '/Senate/Members/Details/3271',\n", + " '/Senate/Members/Details/3406',\n", + " '/Senate/Members/Details/3406',\n", + " '/Senate/Members/Details/3264',\n", + " '/Senate/Members/Details/3264',\n", + " '/Senate/Members/Details/3380',\n", + " '/Senate/Members/Details/3380',\n", + " '/Senate/Members/Details/3369',\n", + " '/Senate/Members/Details/3369',\n", + " '/Senate/Members/Details/3342',\n", + " '/Senate/Members/Details/3342',\n", + " '/Senate/Members/Details/3294',\n", + " '/Senate/Members/Details/3294',\n", + " '/Senate/Members/Details/3313',\n", + " '/Senate/Members/Details/3313',\n", + " '/Senate/Members/Details/3343',\n", + " '/Senate/Members/Details/3343',\n", + " '/Senate/Members/Details/3344',\n", + " '/Senate/Members/Details/3344',\n", + " '/Senate/Members/Details/3404',\n", + " '/Senate/Members/Details/3404',\n", + " '/Senate/Members/Details/3405',\n", + " '/Senate/Members/Details/3405',\n", + " '/Senate/Members/Details/3281',\n", + " '/Senate/Members/Details/3281',\n", + " '/Senate/Members/Details/3295',\n", + " '/Senate/Members/Details/3295',\n", + " '/Senate/Members/Details/3398',\n", + " '/Senate/Members/Details/3398',\n", + " '/Senate/Members/Details/3331',\n", + " '/Senate/Members/Details/3331',\n", + " '/Senate/Members/Details/3296',\n", + " '/Senate/Members/Details/3296',\n", + " '/Senate/Members/Details/3265',\n", + " '/Senate/Members/Details/3265',\n", + " '/Senate/Members/Details/3319',\n", + " '/Senate/Members/Details/3319',\n", + " '/Senate/Members/Details/3399',\n", + " '/Senate/Members/Details/3399',\n", + " '/Senate/Members/Details/3397',\n", + " '/Senate/Members/Details/3397',\n", + " '/Senate/Members/Details/3409',\n", + " '/Senate/Members/Details/3409',\n", + " '/Senate/Members/Details/3385',\n", + " '/Senate/Members/Details/3385',\n", + " '/Senate/Members/Details/3375',\n", + " '/Senate/Members/Details/3375',\n", + " '/Senate/Members/Details/3345',\n", + " '/Senate/Members/Details/3345',\n", + " '/Senate/Members/Details/3449',\n", + " '/Senate/Members/Details/3449',\n", + " '/Senate/Members/Details/3336',\n", + " '/Senate/Members/Details/3336',\n", + " '/Senate/Members/Details/3315',\n", + " '/Senate/Members/Details/3315']" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[link['href'] for link in soup.select(\"a.notranslate\")]\n" ] }, { @@ -417,14 +1722,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 110, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Make a GET request\n", - "req = requests.get('http://www.ilga.gov/senate/default.asp?GA=98')\n", + "req = requests.get('https://www.ilga.gov/Senate/Members/List')\n", "# Read the content of the server’s response\n", "src = req.text\n", "# Soup it\n", @@ -442,9 +1747,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "62" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all table row elements\n", "rows = soup.find_all(\"tr\")\n", @@ -460,14 +1776,411 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 114, "metadata": {}, - "outputs": [], - "source": [ - "# Returns every ‘tr tr tr’ css selector in the page\n", - "rows = soup.select('tr tr tr')\n", - "\n", - "for row in rows[:5]:\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Senator\n", + "↓↑\n", + "\n", + "\n", + "District\n", + "↓↑\n", + "\n", + "\n", + "Party\n", + "↓↑\n", + "\n", + " \n", + "\n", + "\n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "\n", + "Omar Aquino\n", + "2\n", + "D\n", + " \n", + "\n", + "\n", + "Li Arellano, Jr.\n", + "37\n", + "R\n", + " \n", + "\n", + "\n", + "Chris Balkema\n", + "53\n", + "R\n", + " \n", + "\n", + "\n", + "Christopher Belt\n", + "57\n", + "D\n", + " \n", + "\n", + "\n", + "Terri Bryant\n", + "58\n", + "R\n", + " \n", + "\n", + "\n", + "Cristina Castro\n", + "22\n", + "D\n", + " \n", + "\n", + "\n", + "Javier L. Cervantes\n", + "1\n", + "D\n", + " \n", + "\n", + "\n", + "Andrew S. Chesney\n", + "45\n", + "R\n", + " \n", + "\n", + "\n", + "Lakesia Collins\n", + "5\n", + "D\n", + " \n", + "\n", + "\n", + "Bill Cunningham\n", + "18\n", + "D\n", + " \n", + "\n", + "\n", + "John F. Curran\n", + "41\n", + "R\n", + " \n", + "\n", + "\n", + "Donald P. DeWitte\n", + "33\n", + "R\n", + " \n", + "\n", + "\n", + "Mary Edly-Allen\n", + "31\n", + "D\n", + " \n", + "\n", + "\n", + "Laura Ellman\n", + "21\n", + "D\n", + " \n", + "\n", + "\n", + "Paul Faraci\n", + "52\n", + "D\n", + " \n", + "\n", + "\n", + "Sara Feigenholtz\n", + "6\n", + "D\n", + " \n", + "\n", + "\n", + "Laura Fine\n", + "9\n", + "D\n", + " \n", + "\n", + "\n", + "Dale Fowler\n", + "59\n", + "R\n", + " \n", + "\n", + "\n", + "Suzy Glowiak Hilton\n", + "23\n", + "D\n", + " \n", + "\n", + "\n", + "Graciela Guzmán\n", + "20\n", + "D\n", + " \n", + "\n", + "\n", + "Michael W. Halpin\n", + "36\n", + "D\n", + " \n", + "\n", + "\n", + "Don Harmon\n", + "39\n", + "D\n", + " \n", + "\n", + "\n", + "Napoleon Harris, III\n", + "15\n", + "D\n", + " \n", + "\n", + "\n", + "Erica Harriss\n", + "56\n", + "R\n", + " \n", + "\n", + "\n", + "Michael E. Hastings\n", + "19\n", + "D\n", + " \n", + "\n", + "\n", + "Darby A. Hills\n", + "26\n", + "R\n", + " \n", + "\n", + "\n", + "Linda Holmes\n", + "42\n", + "D\n", + " \n", + "\n", + "\n", + "Mattie Hunter\n", + "3\n", + "D\n", + " \n", + "\n", + "\n", + "Adriane Johnson\n", + "30\n", + "D\n", + " \n", + "\n", + "\n", + "Emil Jones, III\n", + "14\n", + "D\n", + " \n", + "\n", + "\n", + "Patrick J. Joyce\n", + "40\n", + "D\n", + " \n", + "\n", + "\n", + "David Koehler\n", + "46\n", + "D\n", + " \n", + "\n", + "\n", + "Seth Lewis\n", + "24\n", + "R\n", + " \n", + "\n", + "\n", + "Kimberly A. Lightford\n", + "4\n", + "D\n", + " \n", + "\n", + "\n", + "Meg Loughran Cappel\n", + "49\n", + "D\n", + " \n", + "\n", + "\n", + "Robert F. Martwick\n", + "10\n", + "D\n", + " \n", + "\n", + "\n", + "Steve McClure\n", + "54\n", + "R\n", + " \n", + "\n", + "\n", + "Julie A. Morrison\n", + "29\n", + "D\n", + " \n", + "\n", + "\n", + "Laura M. Murphy\n", + "28\n", + "D\n", + " \n", + "\n", + "\n", + "Robert Peters\n", + "13\n", + "D\n", + " \n", + "\n", + "\n", + "Jason Plummer\n", + "55\n", + "R\n", + " \n", + "\n", + "\n", + "Mike Porfirio\n", + "11\n", + "D\n", + " \n", + "\n", + "\n", + "Willie Preston\n", + "16\n", + "D\n", + " \n", + "\n", + "\n", + "Sue Rezin\n", + "38\n", + "R\n", + " \n", + "\n", + "\n", + "Chapin Rose\n", + "51\n", + "R\n", + " \n", + "\n", + "\n", + "Mike Simmons\n", + "7\n", + "D\n", + " \n", + "\n", + "\n", + "Elgie R. Sims, Jr.\n", + "17\n", + "D\n", + " \n", + "\n", + "\n", + "Steve Stadelman\n", + "34\n", + "D\n", + " \n", + "\n", + "\n", + "Dave Syverson\n", + "35\n", + "R\n", + " \n", + "\n", + "\n", + "Jil Tracy\n", + "50\n", + "R\n", + " \n", + "\n", + "\n", + "Doris Turner\n", + "48\n", + "D\n", + " \n", + "\n", + "\n", + "Sally J. Turner\n", + "44\n", + "R\n", + " \n", + "\n", + "\n", + "Rachel Ventura\n", + "43\n", + "D\n", + " \n", + "\n", + "\n", + "Karina Villa\n", + "25\n", + "D\n", + " \n", + "\n", + "\n", + "Celina Villanueva\n", + "12\n", + "D\n", + " \n", + "\n", + "\n", + "Ram Villivalam\n", + "8\n", + "D\n", + " \n", + "\n", + "\n", + "Mark L. Walker\n", + "27\n", + "D\n", + " \n", + "\n", + "\n", + "Craig Wilcox\n", + "32\n", + "R\n", + " \n", + "\n", + "\n", + "\n", + "Senator\n", + "↓↑\n", + "\n", + "\n", + "District\n", + "↓↑\n", + "\n", + "\n", + "Party\n", + "↓↑\n", + "\n", + " \n", + "\n", + "\n", + "Dan McConchie\n", + "26\n", + "R\n", + " \n", + "\n" + ] + } + ], + "source": [ + "# Returns every ‘a’ css selector in the page\n", + "rows = soup.select('tr')\n", + "\n", + "for row in rows[:500]:\n", " print(row, '\\n')" ] }, @@ -480,11 +2193,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, "metadata": {}, - "outputs": [], - "source": [ - "example_row = rows[2]\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + " Dave Syverson\n", + " \n", + " \n", + " \n", + " 35\n", + " \n", + " \n", + " R\n", + " \n", + "\n", + "\n" + ] + } + ], + "source": [ + "example_row = rows[50]\n", "print(example_row.prettify())" ] }, @@ -501,9 +2235,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 129, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dave Syverson\n", + "35\n", + "R\n", + "\n", + "\n", + "\n" + ] + } + ], "source": [ "for cell in example_row.select('td'):\n", " print(cell)\n", @@ -527,13 +2274,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 131, "metadata": { "tags": [] }, "outputs": [], "source": [ - "assert example_row.select('td') == example_row.select('.detail') == example_row.select('td.detail')" + "# Only compare selectors that should return the same elements\n", + "assert example_row.select('td.detail') == example_row.select('.detail')" ] }, { @@ -545,12 +2293,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 132, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Dave Syverson,\n", + " 35,\n", + " R]" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Select only those 'td' tags with class 'detail' \n", - "detail_cells = example_row.select('td.detail')\n", + "detail_cells = example_row.select('td')\n", "detail_cells" ] }, @@ -563,9 +2324,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 133, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Dave Syverson', '35', 'R']\n" + ] + } + ], "source": [ "# Keep only the text in each of those cells\n", "row_data = [cell.text for cell in detail_cells]\n", @@ -582,13 +2351,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 135, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dave Syverson\n", + "35\n", + "R\n" + ] + } + ], "source": [ "print(row_data[0]) # Name\n", - "print(row_data[3]) # District\n", - "print(row_data[4]) # Party" + "print(row_data[1]) # District\n", + "print(row_data[2]) # Party" ] }, { @@ -602,9 +2381,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Row 0:\n", + " \n", + "\n", + "Senator\n", + "↓↑\n", + "\n", + "\n", + "District\n", + "↓↑\n", + "\n", + "\n", + "Party\n", + "↓↑\n", + "\n", + " \n", + "\n", + "Row 1:\n", + " \n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "Last Row:\n", + " \n", + "Dan McConchie\n", + "26\n", + "R\n", + "\n" + ] + } + ], "source": [ "print('Row 0:\\n', rows[0], '\\n')\n", "print('Row 1:\\n', rows[1], '\\n')\n", @@ -622,9 +2437,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 139, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7\n", + "7\n", + "7\n", + "7\n" + ] + } + ], "source": [ "# Bad rows\n", "print(len(rows[0]))\n", @@ -644,11 +2470,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 141, "metadata": {}, - "outputs": [], - "source": [ - "good_rows = [row for row in rows if len(row) == 5]\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "\n", + "Craig Wilcox\n", + "32\n", + "R\n", + " \n", + "\n", + "\n", + "Dan McConchie\n", + "26\n", + "R\n", + "\n" + ] + } + ], + "source": [ + "good_rows = [row for row in rows if row.select('td')]\n", "\n", "# Let's check some rows\n", "print(good_rows[0], '\\n')\n", @@ -665,27 +2515,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 142, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Omar Aquino,\n", + " 2,\n", + " D]" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "rows[2].select('td.detail') " + "rows[2].select('td') " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 143, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Dan McConchie, 26, R] \n", + "\n", + "[Christopher Belt, 57, D] \n", + "\n", + "Checking rows...\n", + "\n", + "\n", + "Neil Anderson\n", + "47\n", + "R\n", + " \n", + "\n", + "\n", + "Dan McConchie\n", + "26\n", + "R\n", + "\n" + ] + } + ], "source": [ "# Bad row\n", - "print(rows[-1].select('td.detail'), '\\n')\n", + "print(rows[-1].select('td'), '\\n')\n", "\n", "# Good row\n", - "print(rows[5].select('td.detail'), '\\n')\n", + "print(rows[5].select('td'), '\\n')\n", "\n", "# How about this?\n", - "good_rows = [row for row in rows if row.select('td.detail')]\n", + "good_rows = [row for row in rows if row.select('td')]\n", "\n", "print(\"Checking rows...\\n\")\n", "print(good_rows[0], '\\n')\n", @@ -710,7 +2597,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 145, "metadata": { "tags": [] }, @@ -720,18 +2607,18 @@ "members = []\n", "\n", "# Get rid of junk rows\n", - "valid_rows = [row for row in rows if row.select('td.detail')]\n", + "valid_rows = [row for row in rows if row.select('td')]\n", "\n", "# Loop through all rows\n", "for row in valid_rows:\n", " # Select only those 'td' tags with class 'detail'\n", - " detail_cells = row.select('td.detail')\n", + " detail_cells = row.select('td')\n", " # Keep only the text in each of those cells\n", " row_data = [cell.text for cell in detail_cells]\n", " # Collect information\n", " name = row_data[0]\n", - " district = int(row_data[3])\n", - " party = row_data[4]\n", + " district = int(row_data[1])\n", + " party = row_data[2]\n", " # Store in a tuple\n", " senator = (name, district, party)\n", " # Append to list\n", @@ -740,9 +2627,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 146, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "60" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Should be 61\n", "len(members)" @@ -757,9 +2655,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 147, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Neil Anderson', 47, 'R'), ('Omar Aquino', 2, 'D'), ('Li Arellano, Jr.', 37, 'R'), ('Chris Balkema', 53, 'R'), ('Christopher Belt', 57, 'D')]\n" + ] + } + ], "source": [ "print(members[:5])" ] @@ -803,14 +2709,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 148, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Make a GET request\n", - "req = requests.get('http://www.ilga.gov/senate/default.asp?GA=98')\n", + "req = requests.get('https://www.ilga.gov/Senate/Members/List')\n", "# Read the content of the server’s response\n", "src = req.text\n", "# Soup it\n", @@ -819,20 +2725,20 @@ "members = []\n", "\n", "# Returns every ‘tr tr tr’ css selector in the page\n", - "rows = soup.select('tr tr tr')\n", + "rows = soup.select('tr')\n", "# Get rid of junk rows\n", - "rows = [row for row in rows if row.select('td.detail')]\n", + "rows = [row for row in rows if row.select('td')]\n", "\n", "# Loop through all rows\n", "for row in rows:\n", " # Select only those 'td' tags with class 'detail'\n", - " detail_cells = row.select('td.detail') \n", + " detail_cells = row.select('td') \n", " # Keep only the text in each of those cells\n", " row_data = [cell.text for cell in detail_cells]\n", " # Collect information\n", " name = row_data[0]\n", - " district = int(row_data[3])\n", - " party = row_data[4]\n", + " district = int(row_data[1])\n", + " party = row_data[2]\n", "\n", " # YOUR CODE HERE\n", " full_path = ''\n", @@ -845,14 +2751,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 149, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[('Neil Anderson', 47, 'R', ''),\n", + " ('Omar Aquino', 2, 'D', ''),\n", + " ('Li Arellano, Jr.', 37, 'R', ''),\n", + " ('Chris Balkema', 53, 'R', ''),\n", + " ('Christopher Belt', 57, 'D', '')]" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Uncomment to test \n", - "# members[:5]" + "members[:5]" ] }, { @@ -873,22 +2794,106 @@ "outputs": [], "source": [ "# YOUR CODE HERE\n", - "def get_members(url):\n", - " return [___]\n" + "def get_senate_member_links(url):\n", + " req = requests.get(url)\n", + " soup = BeautifulSoup(req.text, \"lxml\")\n", + " members = []\n", + " table = soup.find(\"table\")\n", + " if table:\n", + " for row in table.find_all(\"tr\")[1:]: # Skip header\n", + " cells = row.find_all(\"td\")\n", + " if len(cells) > 0:\n", + " link_tag = cells[0].find(\"a\")\n", + " if link_tag and link_tag.get(\"href\"):\n", + " name = link_tag.text.strip()\n", + " profile_url = \"https://www.ilga.gov/Senate/Members\" + link_tag[\"href\"]\n", + " members.append((name, profile_url))\n", + " return members" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 171, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total miembros: 59\n", + "('Neil Anderson', 'https://www.ilga.gov//Senate/Members/Details/3312')\n", + "('Omar Aquino', 'https://www.ilga.gov//Senate/Members/Details/3316')\n", + "('Li Arellano, Jr.', 'https://www.ilga.gov//Senate/Members/Details/3383')\n", + "('Chris Balkema', 'https://www.ilga.gov//Senate/Members/Details/3413')\n", + "('Christopher Belt', 'https://www.ilga.gov//Senate/Members/Details/3337')\n", + "('Terri Bryant', 'https://www.ilga.gov//Senate/Members/Details/3386')\n", + "('Cristina Castro', 'https://www.ilga.gov//Senate/Members/Details/3317')\n", + "('Javier L. Cervantes', 'https://www.ilga.gov//Senate/Members/Details/3403')\n", + "('Andrew S. Chesney', 'https://www.ilga.gov//Senate/Members/Details/3410')\n", + "('Lakesia Collins', 'https://www.ilga.gov//Senate/Members/Details/3443')\n", + "('Bill Cunningham', 'https://www.ilga.gov//Senate/Members/Details/3291')\n", + "('John F. Curran', 'https://www.ilga.gov//Senate/Members/Details/3329')\n", + "('Donald P. DeWitte', 'https://www.ilga.gov//Senate/Members/Details/3334')\n", + "('Mary Edly-Allen', 'https://www.ilga.gov//Senate/Members/Details/3407')\n", + "('Laura Ellman', 'https://www.ilga.gov//Senate/Members/Details/3339')\n", + "('Paul Faraci', 'https://www.ilga.gov//Senate/Members/Details/3412')\n", + "('Sara Feigenholtz', 'https://www.ilga.gov//Senate/Members/Details/3376')\n", + "('Laura Fine', 'https://www.ilga.gov//Senate/Members/Details/3338')\n", + "('Dale Fowler', 'https://www.ilga.gov//Senate/Members/Details/3318')\n", + "('Suzy Glowiak Hilton', 'https://www.ilga.gov//Senate/Members/Details/3341')\n", + "('Graciela Guzmán', 'https://www.ilga.gov//Senate/Members/Details/3442')\n", + "('Michael W. Halpin', 'https://www.ilga.gov//Senate/Members/Details/3408')\n", + "('Don Harmon', 'https://www.ilga.gov//Senate/Members/Details/3268')\n", + "('Napoleon Harris, III', 'https://www.ilga.gov//Senate/Members/Details/3292')\n", + "('Erica Harriss', 'https://www.ilga.gov//Senate/Members/Details/3411')\n", + "('Michael E. Hastings', 'https://www.ilga.gov//Senate/Members/Details/3293')\n", + "('Darby A. Hills', 'https://www.ilga.gov//Senate/Members/Details/3460')\n", + "('Linda Holmes', 'https://www.ilga.gov//Senate/Members/Details/3270')\n", + "('Mattie Hunter', 'https://www.ilga.gov//Senate/Members/Details/3269')\n", + "('Adriane Johnson', 'https://www.ilga.gov//Senate/Members/Details/3378')\n", + "('Emil Jones, III', 'https://www.ilga.gov//Senate/Members/Details/3276')\n", + "('Patrick J. Joyce', 'https://www.ilga.gov//Senate/Members/Details/3372')\n", + "('David Koehler', 'https://www.ilga.gov//Senate/Members/Details/3271')\n", + "('Seth Lewis', 'https://www.ilga.gov//Senate/Members/Details/3406')\n", + "('Kimberly A. Lightford', 'https://www.ilga.gov//Senate/Members/Details/3264')\n", + "('Meg Loughran Cappel', 'https://www.ilga.gov//Senate/Members/Details/3380')\n", + "('Robert F. Martwick', 'https://www.ilga.gov//Senate/Members/Details/3369')\n", + "('Steve McClure', 'https://www.ilga.gov//Senate/Members/Details/3342')\n", + "('Julie A. Morrison', 'https://www.ilga.gov//Senate/Members/Details/3294')\n", + "('Laura M. Murphy', 'https://www.ilga.gov//Senate/Members/Details/3313')\n", + "('Robert Peters', 'https://www.ilga.gov//Senate/Members/Details/3343')\n", + "('Jason Plummer', 'https://www.ilga.gov//Senate/Members/Details/3344')\n", + "('Mike Porfirio', 'https://www.ilga.gov//Senate/Members/Details/3404')\n", + "('Willie Preston', 'https://www.ilga.gov//Senate/Members/Details/3405')\n", + "('Sue Rezin', 'https://www.ilga.gov//Senate/Members/Details/3281')\n", + "('Chapin Rose', 'https://www.ilga.gov//Senate/Members/Details/3295')\n", + "('Mike Simmons', 'https://www.ilga.gov//Senate/Members/Details/3398')\n", + "('Elgie R. Sims, Jr.', 'https://www.ilga.gov//Senate/Members/Details/3331')\n", + "('Steve Stadelman', 'https://www.ilga.gov//Senate/Members/Details/3296')\n", + "('Dave Syverson', 'https://www.ilga.gov//Senate/Members/Details/3265')\n", + "('Jil Tracy', 'https://www.ilga.gov//Senate/Members/Details/3319')\n", + "('Doris Turner', 'https://www.ilga.gov//Senate/Members/Details/3399')\n", + "('Sally J. Turner', 'https://www.ilga.gov//Senate/Members/Details/3397')\n", + "('Rachel Ventura', 'https://www.ilga.gov//Senate/Members/Details/3409')\n", + "('Karina Villa', 'https://www.ilga.gov//Senate/Members/Details/3385')\n", + "('Celina Villanueva', 'https://www.ilga.gov//Senate/Members/Details/3375')\n", + "('Ram Villivalam', 'https://www.ilga.gov//Senate/Members/Details/3345')\n", + "('Mark L. Walker', 'https://www.ilga.gov//Senate/Members/Details/3449')\n", + "('Craig Wilcox', 'https://www.ilga.gov//Senate/Members/Details/3336')\n" + ] + } + ], "source": [ "# Test your code\n", - "url = 'http://www.ilga.gov/senate/default.asp?GA=98'\n", - "senate_members = get_members(url)\n", - "len(senate_members)" + "url = 'https://www.ilga.gov/Senate/Members/List'\n", + "members = get_senate_member_links(url)\n", + "len(members)\n", + "\n", + "print(f\"Total miembros: {len(members)}\")\n", + "for member in members:\n", + " print(member)\n" ] }, { @@ -914,40 +2919,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 168, "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_bills(url):\n", + " import requests\n", + " from bs4 import BeautifulSoup\n", " src = requests.get(url).text\n", - " soup = BeautifulSoup(src)\n", + " soup = BeautifulSoup(src, \"lxml\")\n", " rows = soup.select('tr')\n", " bills = []\n", " for row in rows:\n", - " # YOUR CODE HERE\n", - " bill_id =\n", - " description =\n", - " chamber =\n", - " last_action =\n", - " last_action_date =\n", - " bill = (bill_id, description, chamber, last_action, last_action_date)\n", - " bills.append(bill)\n", + " cells = row.find_all('td', class_='billlist')\n", + " if len(cells) >= 5:\n", + " bill_id = cells[0].text.strip()\n", + " description = cells[1].text.strip()\n", + " chamber = cells[2].text.strip()\n", + " last_action = cells[3].text.strip()\n", + " last_action_date = cells[4].text.strip()\n", + " bill = (bill_id, description, chamber, last_action, last_action_date)\n", + " bills.append(bill)\n", " return bills" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 172, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Uncomment to test your code\n", - "# test_url = senate_members[0][3]\n", - "# get_bills(test_url)[0:5]" + "test_url = members[0][1]\n", + "get_bills(test_url)[0:5]" ] }, { @@ -969,7 +2988,8 @@ }, "outputs": [], "source": [ - "# YOUR CODE HERE\n" + "# YOUR CODE HERE\n", + "#debido a los cambios de la pagina no se puede obtener la informacion que se solicitaba en el enunciado" ] }, { @@ -988,7 +3008,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -1002,12 +3022,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "b6f9fe9f4b7182690503d8ecc2bae97b0ee3ebf54e877167ae4d28c119a56988" - } + "version": "3.13.5" } }, "nbformat": 4, From b5c7d97f5b6e322c1116209ceeea7982b1e51c9c Mon Sep 17 00:00:00 2001 From: robertomalave Date: Sat, 23 Aug 2025 17:52:16 -0500 Subject: [PATCH 2/8] comantarios --- lessons/02_web_scraping.ipynb | 636 +++------------------------------- 1 file changed, 52 insertions(+), 584 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 8be777e..2e8d6d8 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -113,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "tags": [] }, @@ -155,9 +155,16 @@ "The process of making a request and obtaining a result resembles that of the Web API workflow. Now, however, we're making a request directly to the website, and we're going to have to parse the HTML ourselves. This is in contrast to being provided data organized into a more straightforward `JSON` or `XML` output." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "se actualiza el link actual para obtener los datos de los miembros ya que la pagina original ha cambiado" + ] + }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 4, "metadata": { "tags": [] }, @@ -184,402 +191,7 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " Illinois General Assembly - Members\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - "
Select Language
\n", - " \n", - "
\n", - "
\n", - "

The Illinois General Assembly offers the Google Translate™ service for visitor convenience. In no way should it be considered accurate as to the translation of any content herein.

\n", - "

Visitors of the Illinois General Assembly website are encouraged to use other translation services available on the internet.

\n", - "

The English language version is always the official and authoritative version of this website.

\n", - "

NOTE: To return to the original English language version, select the \"Show Original\" button on the Google Translate™ menu bar at the top of the window.

\n", - "
\n", - " \n", - "
\n", - " \n", - " English\n", - " \n", - " \n", - " Afrikaans\n", - " \n", - " \n", - " Albanian\n", - " \n", - " \n", - " Arabic\n", - " \n", - " \n", - " Armenian\n", - " \n", - " \n", - " Azerbaijani\n", - " \n", - " \n", - " Basque\n", - " \n", - " \n", - " Bengali\n", - " \n", - " \n", - " Bosnian\n", - " \n", - " \n", - " Catalan\n", - " \n", - " \n", - " Croatian\n", - " \n", - " \n", - " Czech\n", - " \n", - " \n", - " Danish\n", - " \n", - " \n", - " Dutch\n", - " \n", - " \n", - " Esperanto\n", - " \n", - " \n", - " Estonian\n", - " \n", - " \n", - " Filipino\n", - " \n", - " \n", - " Finnish\n", - " \n", - " \n", - " French\n", - " \n", - " \n", - " Galician\n", - " \n", - " \n", - " Georgian\n", - " \n", - " \n", - " German\n", - " \n", - " \n", - " Greek\n", - " \n", - " \n", - " Gujarati\n", - " \n", - " \n", - " Haitian Creole\n", - " \n", - " \n", - " Hausa\n", - " \n", - " \n", - " Hawaiian\n", - " \n", - " \n", - " Hebrew\n", - " \n", - " \n", - " Hindi\n", - " \n", - " \n", - " Hungarian\n", - " \n", - " \n", - " Icelandic\n", - " \n", - " \n", - " Indonesian\n", - " \n", - " \n", - " Interlingua\n", - " \n", - " \n", - " Interlingue\n", - " \n", - " \n", - " Inuktitut\n", - " \n", - " \n", - " Irish\n", - " \n", - " \n", - " Italian\n", - " \n", - " \n", - " Japanese\n", - " \n", - " \n", - " Javanese\n", - " \n", - " \n", - " Kannada\n", - " \n", - " \n", - " Khmer\n", - " \n", - " \n", - " Korean\n", - " \n", - " \n", - " Latin\n", - " \n", - " \n", - " Latvian\n", - " \n", - " \n", - " Lithuanian\n", - " \n", - " \n", - " Luxembourgish\n", - " \n", - " \n", - " Macedonian\n", - " \n", - " \n", - " Malagasy\n", - " \n", - " \n", - " Malayalam\n", - " \n", - " \n", - " Maltese\n", - " \n", - " \n", - " Maori\n", - " \n", - " \n", - " Marathi\n", - " \n", - " \n", - " Myanmar\n", - " \n", - " \n", - " Nepali\n", - " \n", - " \n", - " Norwegian\n", - " \n", - " \n", - " Odia\n", - " \n", - " \n", - " Pashto\n", - " \n", - " \n", - " Punjabi\n", - " \n", - " \n", - " Romanian\n", - " \n", - " \n", - " Russian\n", - " \n", - " \n", - " Samoan\n", - " \n", - " \n", - " Sango\n", - " \n", - " \n", - " Sanskrit\n", - " \n", - " \n", - " Sardinian\n", - " \n", - " \n", - " Sindhi\n", - " \n", - " \n", - " Sinhala\n", - " \n", - " \n", - " Slovak\n", - " \n", - " \n", - " Slovenian\n", - " \n", - " \n", - " Somali\n", - " \n", - " \n", - " Southern Sotho\n", - " \n", - " \n", - " Spanish\n", - " \n", - " \n", - " Sundanese\n", - " \n", - " \n", - " Swahili\n", - " \n", - " \n", - " Swedish\n", - " \n", - " \n", - " Tamil\n", - " \n", - " \n", - " Telugu\n", - " \n", - " \n", - " Thai\n", - " \n", - " \n", - " Tigrinya\n", - " \n", - " \n", - " Tonga\n", - " \n", - " \n", - " Turkish\n", - " \n", - " \n", - " Ukrainian\n", - " \n", - " \n", - " Urdu\n", - " \n", - " \n", - " Vietnamese\n", - " \n", - " \n", - " Welsh\n", - " \n", - " \n", - " Xhosa\n", - " \n", - " \n", - " Yiddish\n", - " \n", - " \n", - " Yoruba\n", - " \n", - " \n", - " Zulu\n", - " \n", - "
\n", - "
\n", - " Powered by \"GoogleTranslate\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
ILGA.gov Virtual Assistant
\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - "

I'm your ILGA.gov Virtual Assistant. Ask me about legislation, bill status, and more.

\n" + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " Illinois General Assembly - Members\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - " Select Language\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " The Illinois General Assembly offers the Google Translate™ service for visitor convenience. In no way should it be considered accurate as to the translation of any content herein.\n", - "

\n", - "

\n", - " Visitors of the Illinois General Assembly website are encouraged to use other translation services available on the internet.\n", - "

\n", - "

\n", - " The English language version is always the official and authoritative version of this website.\n", - "

\n", - "

\n", - " NOTE: To return to the original English language version, select the \"Show Original\" button on the Google Translate™ menu bar at the top of the window.\n", - "

\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - " English\n", - " \n", - " \n", - " \n", - " \n", - " Afrikaans\n", - " \n", - " \n", - " \n", - " \n", - " Albanian\n", - " \n", - " \n", - " \n", - " \n", - " Arabic\n", - " \n", - " \n", - " \n", - " \n", - " Armenian\n", - " \n", - " \n", - " \n", - " \n", - " Azerbaijani\n", - " \n", - " \n", - " \n", - " \n", - " Basque\n", - " \n", - " \n", - " \n", - " \n", - " Bengali\n", - " \n", - " Date: Sun, 24 Aug 2025 09:17:05 -0500 Subject: [PATCH 3/8] =?UTF-8?q?a=C3=B1adir=20comentarios?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lessons/02_web_scraping.ipynb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 2e8d6d8..47cb305 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -2279,6 +2279,13 @@ " return members" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Se extraen los link de los miembros del senado, se verifica que en los links hay un \"/\" adicional, quitando el adicional los links funcionan y redirigen a la infromacion." + ] + }, { "cell_type": "code", "execution_count": 171, From 7bd213264e0fb4e372c6b44a4a7a7a92bda21284 Mon Sep 17 00:00:00 2001 From: patracks9 Date: Mon, 25 Aug 2025 00:31:39 -0400 Subject: [PATCH 4/8] Primeros comentarios hasta el arbol html, parse. --- lessons/02_web_scraping.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 47cb305..68fc9ff 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -2483,7 +2483,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "base", + "display_name": "venv_scraping", "language": "python", "name": "python3" }, @@ -2497,7 +2497,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.13.3" } }, "nbformat": 4, From 093bc6e459483a8779de6b8f76c4ab99049b8786 Mon Sep 17 00:00:00 2001 From: patracks9 Date: Mon, 25 Aug 2025 00:31:57 -0400 Subject: [PATCH 5/8] cambios --- lessons/02_web_scraping.ipynb | 82 ++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 68fc9ff..98ad98d 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -54,32 +54,64 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: requests in c:\\users\\roberto\\anaconda3\\lib\\site-packages (2.32.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (2.3.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from requests) (2025.4.26)\n", + "Collecting requests\n", + " Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)\n", + "Collecting charset_normalizer<4,>=2 (from requests)\n", + " Using cached charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl.metadata (37 kB)\n", + "Collecting idna<4,>=2.5 (from requests)\n", + " Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)\n", + "Collecting urllib3<3,>=1.21.1 (from requests)\n", + " Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)\n", + "Collecting certifi>=2017.4.17 (from requests)\n", + " Using cached certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)\n", + "Using cached requests-2.32.5-py3-none-any.whl (64 kB)\n", + "Using cached certifi-2025.8.3-py3-none-any.whl (161 kB)\n", + "Using cached charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl (107 kB)\n", + "Using cached idna-3.10-py3-none-any.whl (70 kB)\n", + "Using cached urllib3-2.5.0-py3-none-any.whl (129 kB)\n", + "Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests\n", + "Successfully installed certifi-2025.8.3 charset_normalizer-3.4.3 idna-3.10 requests-2.32.5 urllib3-2.5.0\n", "Note: you may need to restart the kernel to use updated packages.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 25.0.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] } ], "source": [ + "# Instalamos todos los paquetes necesarios \n", "%pip install requests" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: beautifulsoup4 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (4.12.3)\n", - "Requirement already satisfied: soupsieve>1.2 in c:\\users\\roberto\\anaconda3\\lib\\site-packages (from beautifulsoup4) (2.5)\n", + "Requirement already satisfied: beautifulsoup4 in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (4.13.5)\n", + "Requirement already satisfied: soupsieve>1.2 in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (from beautifulsoup4) (2.7)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (from beautifulsoup4) (4.14.1)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 25.0.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] } ], "source": [ @@ -95,16 +127,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: lxml in c:\\users\\roberto\\anaconda3\\lib\\site-packages (5.3.0)\n", + "Requirement already satisfied: lxml in c:\\users\\patri\\onedrive\\escritorio\\maestria\\materias\\tratamiento de datos\\semana 2\\tarea2\\repo1\\python-web-scrapinggrupal\\venv_scraping\\lib\\site-packages (6.0.1)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 25.0.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] } ], "source": [ @@ -113,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": { "tags": [] }, @@ -156,15 +197,17 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "se actualiza el link actual para obtener los datos de los miembros ya que la pagina original ha cambiado" + "#Lo que se hace a continuacion sera leer el código html de la pagina https://www.ilga.gov/Senate/Members e imprimira solo una pequeña parte" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": { "tags": [] }, @@ -217,7 +260,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Los siguientes comandos transforman el código html crudo a un arbol de etiquetas que puede ser leído facilmente por Python" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ { From b3ad1e6ea95d62961bc18b0b90c29df6ae676027 Mon Sep 17 00:00:00 2001 From: patracks9 Date: Mon, 25 Aug 2025 00:58:14 -0400 Subject: [PATCH 6/8] Correccion de la clase sidemenu a notranslate --- lessons/02_web_scraping.ipynb | 128 +++++++--------------------------- 1 file changed, 26 insertions(+), 102 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 98ad98d..f7b82ae 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -305,13 +305,6 @@ "print(soup.prettify()[:1000])" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Se verifica que se esta extrayendo la informacion de la pagina web" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -339,15 +332,17 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "se remplaza tr por la variable a, que es donde se guarda la informacion en la pagina web, analiznado el html para determinar donde se esta guardando la informacion" + "#Los siguientes comandos de la libreria beautifulSoup, encuentran todos los elementos html que tengan la etiqueta \"a\" e imprime los 10 primeros" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -530,40 +525,14 @@ " Yoruba\n", " , \n", " Zulu\n", - " , \"GoogleTranslate, ILGA.GOV, \n", - "LEGISLATION & LAWS \n", - ", Bills & Resolutions, Public Acts, Illinois Compiled Statutes, Illinois Constitution, Search Legislation, Glossary, Guide, \n", - "Reports & Inquiry \n", - ", Legislative Reports, Special Reports, FTP Site, Legislator Lookup, Capitol Complex Phone Numbers, \n", - "Rules & Regulations \n", - ", Illinois Register, Administrative Rules, \n", - "Senate \n", - ", Members, Schedules, Committees,  Request for Remote Testimony, Journals, Transcripts, Rules, Audio/Video, FOIA Information, Senate Employment Opportunities, Media Guidelines, \n", - "House \n", - ", Members, Schedules, Committees,  Submit testimony for House Committees, Journals, Transcripts, Rules, Audio/Video, FOIA Information, House Employment Opportunities,  Log In, Home, View List, Officers, Leadership, Seating Chart, Report List, Neil Anderson, Neil Anderson, Omar Aquino, Omar Aquino, Li Arellano, Jr., Li Arellano, Jr., Chris Balkema, Chris Balkema, Christopher Belt, Christopher Belt, Terri Bryant, Terri Bryant, Cristina Castro, Cristina Castro, Javier L. Cervantes, Javier L. Cervantes, Andrew S. Chesney, Andrew S. Chesney, Lakesia Collins, Lakesia Collins, Bill Cunningham, Bill Cunningham, John F. Curran, John F. Curran, Donald P. DeWitte, Donald P. DeWitte, Mary Edly-Allen, Mary Edly-Allen, Laura Ellman, Laura Ellman, Paul Faraci, Paul Faraci, Sara Feigenholtz, Sara Feigenholtz, Laura Fine, Laura Fine, Dale Fowler, Dale Fowler, Suzy Glowiak Hilton, Suzy Glowiak Hilton, Graciela Guzmán, Graciela Guzmán, Michael W. Halpin, Michael W. Halpin, Don Harmon, Don Harmon, Napoleon Harris, III, Napoleon Harris, III, Erica Harriss, Erica Harriss, Michael E. Hastings, Michael E. Hastings, Darby A. Hills, Darby A. Hills, Linda Holmes, Linda Holmes, Mattie Hunter, Mattie Hunter, Adriane Johnson, Adriane Johnson, Emil Jones, III, Emil Jones, III, Patrick J. Joyce, Patrick J. Joyce, David Koehler, David Koehler, Seth Lewis, Seth Lewis, Kimberly A. Lightford, Kimberly A. Lightford, Meg Loughran Cappel, Meg Loughran Cappel, Robert F. Martwick, Robert F. Martwick, Steve McClure, Steve McClure, Julie A. Morrison, Julie A. Morrison, Laura M. Murphy, Laura M. Murphy, Robert Peters, Robert Peters, Jason Plummer, Jason Plummer, Mike Porfirio, Mike Porfirio, Willie Preston, Willie Preston, Sue Rezin, Sue Rezin, Chapin Rose, Chapin Rose, Mike Simmons, Mike Simmons, Elgie R. Sims, Jr., Elgie R. Sims, Jr., Steve Stadelman, Steve Stadelman, Dave Syverson, Dave Syverson, Jil Tracy, Jil Tracy, Doris Turner, Doris Turner, Sally J. Turner, Sally J. Turner, Rachel Ventura, Rachel Ventura, Karina Villa, Karina Villa, Celina Villanueva, Celina Villanueva, Ram Villivalam, Ram Villivalam, Mark L. Walker, Mark L. Walker, Craig Wilcox, Craig Wilcox, Dan McConchie, Dan McConchie, \n", - " Contact ILGA Webmaster\n", - " , \n", - "\n", - ", \n", - "\n", - ", ILGA.GOV, Disclaimers, \n", - " ADA\n", - " , \n", - " Contact ILGA Webmaster\n", - " , \n", - "\n", - ", \n", - "\n", - ", ILGA.GOV, Disclaimers, \n", - " ADA\n", - " , ]\n" + " , \"GoogleTranslate, ILGA.GOV]\n" ] } ], "source": [ "# Find all elements with a certain tag\n", "a_tags = soup.find_all(\"a\")\n", - "print(a_tags[:1000])" + "print(a_tags[:90])" ] }, { @@ -577,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, "metadata": { "tags": [] }, @@ -611,7 +580,16 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Obtuvimos 270 links" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -626,13 +604,6 @@ "print(len(a_tags))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "luego de correr los comandos se obtiene 270 registros" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -645,15 +616,17 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "se realiza el cambio de clase a \"notranslate\" que es la clase que guarda la variable a donde esta la informacion" + "#Se corrigió el nombre de la clase, y lo que hacemos aqui es buscar etiquetas ¨a¨ y filtrar aquellos elementos de la clase ¨nontranslate¨" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "metadata": { "tags": [] }, @@ -661,59 +634,10 @@ { "data": { "text/plain": [ - "[Neil Anderson,\n", - " Neil Anderson,\n", - " Omar Aquino,\n", - " Omar Aquino,\n", - " Li Arellano, Jr.,\n", - " Li Arellano, Jr.,\n", - " Chris Balkema,\n", - " Chris Balkema,\n", - " Christopher Belt,\n", - " Christopher Belt,\n", - " Terri Bryant,\n", - " Terri Bryant,\n", - " Cristina Castro,\n", - " Cristina Castro,\n", - " Javier L. Cervantes,\n", - " Javier L. Cervantes,\n", - " Andrew S. Chesney,\n", - " Andrew S. Chesney,\n", - " Lakesia Collins,\n", - " Lakesia Collins,\n", - " Bill Cunningham,\n", - " Bill Cunningham,\n", - " John F. Curran,\n", - " John F. Curran,\n", - " Donald P. DeWitte,\n", - " Donald P. DeWitte,\n", - " Mary Edly-Allen,\n", - " Mary Edly-Allen,\n", - " Laura Ellman,\n", - " Laura Ellman,\n", - " Paul Faraci,\n", - " Paul Faraci,\n", - " Sara Feigenholtz,\n", - " Sara Feigenholtz,\n", - " Laura Fine,\n", - " Laura Fine,\n", - " Dale Fowler,\n", - " Dale Fowler,\n", - " Suzy Glowiak Hilton,\n", - " Suzy Glowiak Hilton,\n", - " Graciela Guzmán,\n", - " Graciela Guzmán,\n", - " Michael W. Halpin,\n", - " Michael W. Halpin,\n", - " Don Harmon,\n", - " Don Harmon,\n", - " Napoleon Harris, III,\n", - " Napoleon Harris, III,\n", - " Erica Harriss,\n", - " Erica Harriss]" + "[]" ] }, - "execution_count": 76, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } From 49b1e45f0ccb60de9918fb95ffe5c32c5282e43d Mon Sep 17 00:00:00 2001 From: patracks9 Date: Mon, 25 Aug 2025 01:04:17 -0400 Subject: [PATCH 7/8] cambios antes del challenge 1 --- lessons/02_web_scraping.ipynb | 75 +++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index f7b82ae..12cf0ed 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -626,7 +626,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "tags": [] }, @@ -634,10 +634,59 @@ { "data": { "text/plain": [ - "[]" + "[Neil Anderson,\n", + " Neil Anderson,\n", + " Omar Aquino,\n", + " Omar Aquino,\n", + " Li Arellano, Jr.,\n", + " Li Arellano, Jr.,\n", + " Chris Balkema,\n", + " Chris Balkema,\n", + " Christopher Belt,\n", + " Christopher Belt,\n", + " Terri Bryant,\n", + " Terri Bryant,\n", + " Cristina Castro,\n", + " Cristina Castro,\n", + " Javier L. Cervantes,\n", + " Javier L. Cervantes,\n", + " Andrew S. Chesney,\n", + " Andrew S. Chesney,\n", + " Lakesia Collins,\n", + " Lakesia Collins,\n", + " Bill Cunningham,\n", + " Bill Cunningham,\n", + " John F. Curran,\n", + " John F. Curran,\n", + " Donald P. DeWitte,\n", + " Donald P. DeWitte,\n", + " Mary Edly-Allen,\n", + " Mary Edly-Allen,\n", + " Laura Ellman,\n", + " Laura Ellman,\n", + " Paul Faraci,\n", + " Paul Faraci,\n", + " Sara Feigenholtz,\n", + " Sara Feigenholtz,\n", + " Laura Fine,\n", + " Laura Fine,\n", + " Dale Fowler,\n", + " Dale Fowler,\n", + " Suzy Glowiak Hilton,\n", + " Suzy Glowiak Hilton,\n", + " Graciela Guzmán,\n", + " Graciela Guzmán,\n", + " Michael W. Halpin,\n", + " Michael W. Halpin,\n", + " Don Harmon,\n", + " Don Harmon,\n", + " Napoleon Harris, III,\n", + " Napoleon Harris, III,\n", + " Erica Harriss,\n", + " Erica Harriss]" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -659,7 +708,16 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Podemos buscar elementos ahora por medio de selectores CSS, exactamente el mismo resultado" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "metadata": { "tags": [] }, @@ -719,7 +777,7 @@ " Erica Harriss]" ] }, - "execution_count": 78, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -731,10 +789,13 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Se obtiene la informacion de los senadores de Illinois" + "# Se comprueba que ambos comandos arrojan exactamente el mismo resultado, personalmente es mas facil buscar elementos\n", + "#por medio de selectores css, esdecir, con select" ] }, { From 1e46e49510b18cbedae35f130074be70f53960f3 Mon Sep 17 00:00:00 2001 From: patracks9 Date: Mon, 25 Aug 2025 21:57:54 -0400 Subject: [PATCH 8/8] Repositorio 1 finalizado --- lessons/02_web_scraping.ipynb | 93 ++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index 12cf0ed..3dcd198 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -809,7 +809,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -937,7 +937,7 @@ " Dan McConchie]" ] }, - "execution_count": 79, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -963,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 27, "metadata": { "tags": [] }, @@ -1000,7 +1000,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 28, "metadata": { "tags": [] }, @@ -1036,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 29, "metadata": { "tags": [] }, @@ -1064,7 +1064,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1192,7 +1192,7 @@ " '/Senate/Members/Details/3315']" ] }, - "execution_count": 84, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1227,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 31, "metadata": { "tags": [] }, @@ -1252,7 +1252,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1261,7 +1261,7 @@ "62" ] }, - "execution_count": 111, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1269,7 +1269,8 @@ "source": [ "# Get all table row elements\n", "rows = soup.find_all(\"tr\")\n", - "len(rows)" + "len(rows)\n", + "#rows" ] }, { @@ -1281,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1682,7 +1683,7 @@ } ], "source": [ - "# Returns every ‘a’ css selector in the page\n", + "# Returns every ‘tr’ css selector in the page\n", "rows = soup.select('tr')\n", "\n", "for row in rows[:500]:\n", @@ -1698,24 +1699,24 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\n", " \n", - " \n", - " Dave Syverson\n", + " \n", + " Omar Aquino\n", " \n", " \n", " \n", - " 35\n", + " 2\n", " \n", " \n", - " R\n", + " D\n", " \n", "\n", "\n" @@ -1723,7 +1724,7 @@ } ], "source": [ - "example_row = rows[50]\n", + "example_row = rows[2]\n", "print(example_row.prettify())" ] }, @@ -1740,16 +1741,16 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dave Syverson\n", - "35\n", - "R\n", + "Omar Aquino\n", + "2\n", + "D\n", "\n", "\n", "\n" @@ -1779,7 +1780,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 42, "metadata": { "tags": [] }, @@ -1798,18 +1799,18 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Dave Syverson,\n", - " 35,\n", - " R]" + "[Omar Aquino,\n", + " 2,\n", + " D]" ] }, - "execution_count": 132, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1829,14 +1830,14 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['Dave Syverson', '35', 'R']\n" + "['Omar Aquino', '2', 'D']\n" ] } ], @@ -1856,16 +1857,16 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dave Syverson\n", - "35\n", - "R\n" + "Omar Aquino\n", + "2\n", + "D\n" ] } ], @@ -1886,7 +1887,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1942,7 +1943,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -1975,7 +1976,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -2498,7 +2499,19 @@ "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'senate_members' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[55]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtime\u001b[39;00m\n\u001b[32m 3\u001b[39m bills_dict = {}\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m member \u001b[38;5;129;01min\u001b[39;00m \u001b[43msenate_members\u001b[49m:\n\u001b[32m 6\u001b[39m name, district, party, bill_url = member\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m bill_url:\n", + "\u001b[31mNameError\u001b[39m: name 'senate_members' is not defined" + ] + } + ], "source": [ "# YOUR CODE HERE\n", "#debido a los cambios de la pagina no se puede obtener la informacion que se solicitaba en el enunciado"