diff --git a/lessons/02_web_scraping.ipynb b/lessons/02_web_scraping.ipynb index c8945b6..acd5906 100644 --- a/lessons/02_web_scraping.ipynb +++ b/lessons/02_web_scraping.ipynb @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -128,18 +128,20 @@ } ], "source": [ + "# 🧩 El comando %pip install lxml instala la librería lxml en tu entorno de Jupyter Notebook.\n", + "# ⚡ lxml es un parser rápido y eficiente para analizar y procesar archivos HTML y XML, muy útil para usar con Beautiful Soup en web scraping.\n", "%pip install lxml" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": { "tags": [] }, "outputs": [], "source": [ - "# Import required libraries\n", + "# importamos las librerías necesarias\n", "from bs4 import BeautifulSoup\n", "from datetime import datetime\n", "import requests\n", @@ -152,45 +154,60 @@ "source": [ "\n", "\n", - "# Extracting and Parsing HTML \n", + "# Extracción y análisis de HTML\n", "\n", - "In order to succesfully scrape and analyse HTML, we'll be going through the following 4 steps:\n", - "1. Make a GET request\n", - "2. Parse the page with Beautiful Soup\n", - "3. Search for HTML elements\n", - "4. Get attributes and text of these elements" + "Para extraer y analizar correctamente HTML, seguiremos los siguientes 4 pasos:\n", + "1. Realizar una solicitud GET\n", + "2. Analizar la página con Beautiful Soup\n", + "3. Buscar elementos HTML\n", + "4. Obtener atributos y texto de estos elementos" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 1: Make a GET Request to Obtain a Page's HTML\n", + "## Paso 1: Realiza una solicitud GET para obtener el HTML de una página\n", + "\n", + "Podemos usar la librería Requests para:\n", "\n", - "We can use the Requests library to:\n", + "1. Realizar una solicitud GET a la página, y\n", "\n", - "1. Make a GET request to the page, and\n", - "2. 
Read in the webpage's HTML code.\n", + "2. Leer el código HTML de la página web.\n", "\n", - "The process of making a request and obtaining a result resembles that of the Web API workflow. Now, however, we're making a request directly to the website, and we're going to have to parse the HTML ourselves. This is in contrast to being provided data organized into a more straightforward `JSON` or `XML` output." + "El proceso de realizar una solicitud y obtener un resultado se asemeja al flujo de trabajo de una API web. Sin embargo, en este caso estamos haciendo la solicitud directamente al sitio web y tendremos que analizar el HTML por nuestra cuenta. Esto es diferente a cuando se nos proporciona la información ya organizada en un formato más sencillo como JSON o XML." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 24, "metadata": { "tags": [] }, "outputs": [ { - "ename": "NameError", - "evalue": "name 'requests' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Make a GET request\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m req = \u001b[43mrequests\u001b[49m.get(\u001b[33m'\u001b[39m\u001b[33mhttp://www.ilga.gov/senate/default.asp\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# Read the content of the server’s response\u001b[39;00m\n\u001b[32m 4\u001b[39m src = req.text\n", - "\u001b[31mNameError\u001b[39m: name 'requests' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " English\n", + " , \n", + " Afrikaans\n", + " , \n", + " Albanian\n", + " , \n", + " Arabic\n", + " , \n", + " Armenian\n", + " , \n", + " Azerbaijani\n", + " , \n", + " Basque\n", + " , \n", + " Bengali\n", + " , \n", + " Bosnian\n", + " , \n", + " Catalan\n", + " ]\n" + ] + } + ], "source": [ "# Find all elements with a certain tag\n", "a_tags = soup.find_all(\"a\")\n", @@ -274,11 +345,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " English\n", + " \n", + "\n", + " English\n", + " \n" + ] + } + ], "source": [ "a_tags = soup.find_all(\"a\")\n", "a_tags_alt = soup(\"a\")\n", @@ -295,9 +379,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "270\n" + ] + } + ], "source": [ "print(len(a_tags))" ] @@ -315,11 +407,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get only the 'a' tags in 'sidemenu' class\n", "side_menus = soup(\"a\", class_=\"sidemenu\")\n", @@ -337,11 +440,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 30, + "metadata": {}, + 
"output_type": "execute_result" + } + ], "source": [ "# Get elements with \"a.sidemenu\" CSS Selector.\n", "selected = soup.select(\"a.sidemenu\")\n", @@ -359,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -382,11 +496,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIndexError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[32]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m side_menu_links = soup.select(\u001b[33m\"\u001b[39m\u001b[33ma.sidemenu\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# Examine the first link\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m first_link = \u001b[43mside_menu_links\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[38;5;28mprint\u001b[39m(first_link)\n\u001b[32m 8\u001b[39m \u001b[38;5;66;03m# What class is this variable?\u001b[39;00m\n", + "\u001b[31mIndexError\u001b[39m: list index out of range" + ] + } + ], "source": [ "# Get all sidemenu links as a list\n", "side_menu_links = soup.select(\"a.sidemenu\")\n",