def get_product_info(product="laptop"):
    """Scrape the first Amazon.in results page for *product*.

    Collects, per listing: title, link, current price, rating, MRP and the
    percentage discount.  The table is written to
    ``Amazon Product Data({product}).csv`` and also returned as a DataFrame.
    If no product name is given, "laptop" is searched by default.
    """

    def parse_price(price):
        # "₹1,23,456" -> 123456.0; NaN for empty / malformed text.
        try:
            return float(price.strip("₹").replace(",", ""))
        except ValueError:
            return float("nan")

    url = f"https://www.amazon.in/laptop/s?k={product}"  # search query URL
    # Browser-like headers so Amazon serves the normal results page instead of
    # rejecting the request as automated traffic.
    header = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.5",
    }
    # Timeout so a stalled connection cannot hang the caller forever.
    page = requests.get(url, headers=header, timeout=10)
    # Name the parser explicitly: bare bs(text) picks whichever parser happens
    # to be installed and emits GuessedAtParserWarning.
    soup = bs(page.text, "html.parser")
    # DataFrame that accumulates one row per product listing.
    data = pd.DataFrame(
        columns=[
            "Product Title",
            "Product Link",
            "Current Price of the product",
            "Product Rating",
            "MRP of the product",
            "Discount",
        ]
    )
    for i, j in it.zip_longest(
        soup.find_all(
            "div",
            attrs={"class": "s-result-item", "data-component-type": "s-search-result"},
        ),
        soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}),
    ):
        try:
            product_title = i.h2.text
            product_link = "https://www.amazon.in/" + i.h2.a["href"]
            product_price = i.find("span", attrs={"class": "a-offscreen"}).text
        except AttributeError:
            # A mandatory field (title/link/price) is missing: skip the entry.
            # (A bare `pass` here would append stale or undefined values.)
            continue
        try:
            product_rating = i.find("span", attrs={"class": "a-icon-alt"}).text
        except AttributeError:
            product_rating = "Not available"
        try:
            product_mrp = "₹" + i.find(
                "span", attrs={"class": "a-price a-text-price"}
            ).text.split("₹")[1]
        except AttributeError:
            product_mrp = ""
        mrp_value = parse_price(product_mrp)
        price_value = parse_price(product_price)
        if mrp_value == mrp_value and price_value == price_value and mrp_value:
            discount = (mrp_value - price_value) / mrp_value * 100
        else:
            discount = ""
        data.loc[len(data.index)] = [
            product_title,
            product_link,
            product_price,
            product_rating,
            product_mrp,
            discount,
        ]
    # Blank out MRP and discount where the current price is not actually below
    # the MRP.  Compare the parsed numbers: comparing the raw '₹…' strings is
    # lexicographic and gives wrong answers (e.g. '₹999' > '₹1,000').
    overpriced = data["Current Price of the product"].map(parse_price) > data[
        "MRP of the product"
    ].map(parse_price)
    data.loc[overpriced, ["MRP of the product", "Discount"]] = " "
    data.index += 1  # 1-based row numbers in the CSV
    data.to_csv(f"Amazon Product Data({product}).csv")  # write the data out
    return data
+import itertools as it +import pandas as pd +import requests +from bs4 import BeautifulSoup as bs -import pandas as pd -from bs4 import BeautifulSoup as bs -import requests -import itertools as it - - - -def get_product_info(product = 'laptop'): #function that will take the product as input and return the product details as output in the form of a csv file,if no input is given,it will fetch the details of laptop by default - url = f'https://www.amazon.in/laptop/s?k={product}' #generation of search query url - header = ({'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US, en;q=0.5'}) #header that will indicate to the destination server that the request is coming from a genuine human being and not a bot - page = requests.get(url,headers = header) - page_content = page.text - soup =bs(page_content) - data = pd.DataFrame(columns = ['Product Title','Product Link','Current Price of the product','Product Rating','MRP of the product','Discount']) #initializing a pandas dataframe to store the requisite information - for i, j in it.zip_longest(soup.find_all("div",attrs={"class": 's-result-item','data-component-type': 's-search-result'}),soup.find_all("div",attrs={"class": 'a-row a-size-base a-color-base'})): #for loop to parse through each entry and store them in the dataframe along with try....except block for handling exceptions that may arise - try: - product_title = i.h2.text - product_link = 'https://www.amazon.in/' + i.h2.a['href'] - product_price = i.find("span",attrs={"class": 'a-offscreen'}).text - try: - product_rating = i.find('span',attrs={"class": 'a-icon-alt'}).text - except AttributeError: - product_rating = 'Not available' - try: - product_mrp = '₹' + i.find("span",attrs={"class": 'a-price a-text-price'}).text.split('₹')[1] - except AttributeError: - product_mrp = '' - try: - discount = 
float(((float(product_mrp.strip('₹').replace(',',''))-float(product_price.strip('₹').replace(',','')))/float(product_mrp.strip('₹').replace(',','')))*100) - except ValueError: - discount = '' - except AttributeError: - pass - data.loc[len(data.index)] = [product_title,product_link,product_price,product_rating,product_mrp,discount] - data.loc[data['Current Price of the product'] > data['MRP of the product'], 'MRP of the product'] = ' ' - data.loc[data['Current Price of the product'] > data['MRP of the product'], 'Discount'] = ' ' - data.index += 1 - data.to_csv(f'Amazon Product Data({product}).csv') #writing the data to the csv file +def get_product_info( + product="laptop", +): # function that will take the product as input and return the product details as output in the form of a csv file,if no input is given,it will fetch the details of laptop by default + url = ( + f"https://www.amazon.in/laptop/s?k={product}" # generation of search query url + ) + header = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", + "Accept-Language": "en-US, en;q=0.5", + } # header that will indicate to the destination server that the request is coming from a genuine human being and not a bot + page = requests.get(url, headers=header) + page_content = page.text + soup = bs(page_content) + data = pd.DataFrame( + columns=[ + "Product Title", + "Product Link", + "Current Price of the product", + "Product Rating", + "MRP of the product", + "Discount", + ] + ) # initializing a pandas dataframe to store the requisite information + for i, j in it.zip_longest( + soup.find_all( + "div", + attrs={"class": "s-result-item", "data-component-type": "s-search-result"}, + ), + soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}), + ): # for loop to parse through each entry and store them in the dataframe along with try....except block for handling exceptions that may arise + try: + product_title = i.h2.text 
+ product_link = "https://www.amazon.in/" + i.h2.a["href"] + product_price = i.find("span", attrs={"class": "a-offscreen"}).text + try: + product_rating = i.find("span", attrs={"class": "a-icon-alt"}).text + except AttributeError: + product_rating = "Not available" + try: + product_mrp = ( + "₹" + + i.find( + "span", attrs={"class": "a-price a-text-price"} + ).text.split("₹")[1] + ) + except AttributeError: + product_mrp = "" + try: + discount = float( + ( + ( + float(product_mrp.strip("₹").replace(",", "")) + - float(product_price.strip("₹").replace(",", "")) + ) + / float(product_mrp.strip("₹").replace(",", "")) + ) + * 100 + ) + except ValueError: + discount = "" + except AttributeError: + pass + data.loc[len(data.index)] = [ + product_title, + product_link, + product_price, + product_rating, + product_mrp, + discount, + ] + data.loc[ + data["Current Price of the product"] > data["MRP of the product"], + "MRP of the product", + ] = " " + data.loc[ + data["Current Price of the product"] > data["MRP of the product"], "Discount" + ] = " " + data.index += 1 + data.to_csv( + f"Amazon Product Data({product}).csv" + ) # writing the data to the csv file return data From 350fd28e0cba65d3cd36c1510a44942c4d03c445 Mon Sep 17 00:00:00 2001 From: SparshRastogi <75373475+SparshRastogi@users.noreply.github.com> Date: Thu, 27 Oct 2022 21:26:54 +0530 Subject: [PATCH 03/13] Update fetch_amazon_product_data.py Added type hints and modified files to pass precommit test --- web_programming/fetch_amazon_product_data.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index 97052cf19884..2bdccc216a51 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -15,16 +15,17 @@ from bs4 import BeautifulSoup as bs -def get_product_info( - product="laptop", -): # function that will take the product as input and 
return the product details as output in the form of a csv file,if no input is given,it will fetch the details of laptop by default +def get_product_info(product="laptop") -> None: +# function that will take the product as input and return the product details as output +#in the form of a csv file,if no input is given,it will fetch the details of laptop by default url = ( f"https://www.amazon.in/laptop/s?k={product}" # generation of search query url ) header = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", +"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", "Accept-Language": "en-US, en;q=0.5", - } # header that will indicate to the destination server that the request is coming from a genuine human being and not a bot + } # header that will indicate to the destination server that the request is coming from + #a genuine human being and not a bot page = requests.get(url, headers=header) page_content = page.text soup = bs(page_content) @@ -44,7 +45,8 @@ def get_product_info( attrs={"class": "s-result-item", "data-component-type": "s-search-result"}, ), soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}), - ): # for loop to parse through each entry and store them in the dataframe along with try....except block for handling exceptions that may arise + ): # for loop to parse through each entry and store them in the dataframe along with try.... 
+ #except block for handling exceptions that may arise try: product_title = i.h2.text product_link = "https://www.amazon.in/" + i.h2.a["href"] @@ -96,4 +98,4 @@ def get_product_info( data.to_csv( f"Amazon Product Data({product}).csv" ) # writing the data to the csv file - return data + From eed46e2790f10e7f7c553d6de5a5c07e7797afa0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 27 Oct 2022 15:57:53 +0000 Subject: [PATCH 04/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/fetch_amazon_product_data.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index 2bdccc216a51..b8b01777c0af 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -15,17 +15,17 @@ from bs4 import BeautifulSoup as bs -def get_product_info(product="laptop") -> None: -# function that will take the product as input and return the product details as output -#in the form of a csv file,if no input is given,it will fetch the details of laptop by default +def get_product_info(product="laptop") -> None: + # function that will take the product as input and return the product details as output + # in the form of a csv file,if no input is given,it will fetch the details of laptop by default url = ( f"https://www.amazon.in/laptop/s?k={product}" # generation of search query url ) header = { -"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", "Accept-Language": "en-US, en;q=0.5", - } # header that will indicate to the destination server that the request is coming from - #a genuine human being and 
not a bot + } # header that will indicate to the destination server that the request is coming from + # a genuine human being and not a bot page = requests.get(url, headers=header) page_content = page.text soup = bs(page_content) @@ -46,7 +46,7 @@ def get_product_info(product="laptop") -> None: ), soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}), ): # for loop to parse through each entry and store them in the dataframe along with try.... - #except block for handling exceptions that may arise + # except block for handling exceptions that may arise try: product_title = i.h2.text product_link = "https://www.amazon.in/" + i.h2.a["href"] @@ -98,4 +98,3 @@ def get_product_info(product="laptop") -> None: data.to_csv( f"Amazon Product Data({product}).csv" ) # writing the data to the csv file - From 2e4134a17a5e28efa9fb7c8ea86969b2cf21b4da Mon Sep 17 00:00:00 2001 From: SparshRastogi <75373475+SparshRastogi@users.noreply.github.com> Date: Fri, 28 Oct 2022 09:20:00 +0530 Subject: [PATCH 05/13] Update fetch_amazon_product_data.py Added type hints and made changes to pass the precommit --- web_programming/fetch_amazon_product_data.py | 25 ++++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index b8b01777c0af..8d617e40bdb3 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -15,17 +15,21 @@ from bs4 import BeautifulSoup as bs -def get_product_info(product="laptop") -> None: - # function that will take the product as input and return the product details as output - # in the form of a csv file,if no input is given,it will fetch the details of laptop by default +def get_product_info(product:str = "laptop") -> None: + # function that will take the product as input and return the + #product details as output + # in the form of a csv file,if no input is given,it + #will fetch the 
details of laptop by default url = ( - f"https://www.amazon.in/laptop/s?k={product}" # generation of search query url + f"https://www.amazon.in/laptop/s?k={product}" + # generation of search query url ) header = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", + "User-Agent": +"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", "Accept-Language": "en-US, en;q=0.5", - } # header that will indicate to the destination server that the request is coming from - # a genuine human being and not a bot + } # header that will indicate to the destination server that the + # request is coming from a genuine human being and not a bot page = requests.get(url, headers=header) page_content = page.text soup = bs(page_content) @@ -42,11 +46,12 @@ def get_product_info(product="laptop") -> None: for i, j in it.zip_longest( soup.find_all( "div", - attrs={"class": "s-result-item", "data-component-type": "s-search-result"}, + attrs={"class": "s-result-item", + "data-component-type": "s-search-result"}, ), soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}), - ): # for loop to parse through each entry and store them in the dataframe along with try.... 
- # except block for handling exceptions that may arise + ): # for loop to parse through each entry and store them in the dataframe + #along with try....except block for handling exceptions that may arise try: product_title = i.h2.text product_link = "https://www.amazon.in/" + i.h2.a["href"] From 0680d0113b8d15f2608cb802e1c4586c1749ad8e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 28 Oct 2022 03:51:15 +0000 Subject: [PATCH 06/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/fetch_amazon_product_data.py | 24 +++++++++----------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index 8d617e40bdb3..62062f759c29 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -15,21 +15,20 @@ from bs4 import BeautifulSoup as bs -def get_product_info(product:str = "laptop") -> None: - # function that will take the product as input and return the - #product details as output - # in the form of a csv file,if no input is given,it - #will fetch the details of laptop by default +def get_product_info(product: str = "laptop") -> None: + # function that will take the product as input and return the + # product details as output + # in the form of a csv file,if no input is given,it + # will fetch the details of laptop by default url = ( - f"https://www.amazon.in/laptop/s?k={product}" + f"https://www.amazon.in/laptop/s?k={product}" # generation of search query url ) header = { - "User-Agent": -"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", "Accept-Language": "en-US, en;q=0.5", } # header that will 
indicate to the destination server that the - # request is coming from a genuine human being and not a bot + # request is coming from a genuine human being and not a bot page = requests.get(url, headers=header) page_content = page.text soup = bs(page_content) @@ -46,12 +45,11 @@ def get_product_info(product:str = "laptop") -> None: for i, j in it.zip_longest( soup.find_all( "div", - attrs={"class": "s-result-item", - "data-component-type": "s-search-result"}, + attrs={"class": "s-result-item", "data-component-type": "s-search-result"}, ), soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}), - ): # for loop to parse through each entry and store them in the dataframe - #along with try....except block for handling exceptions that may arise + ): # for loop to parse through each entry and store them in the dataframe + # along with try....except block for handling exceptions that may arise try: product_title = i.h2.text product_link = "https://www.amazon.in/" + i.h2.a["href"] From 2ead046c45eb0845a85692635fbb3984ea3cf99e Mon Sep 17 00:00:00 2001 From: SparshRastogi <75373475+SparshRastogi@users.noreply.github.com> Date: Fri, 28 Oct 2022 09:58:24 +0530 Subject: [PATCH 07/13] Update fetch_amazon_product_data.py Modified function to return the data in the form of Pandas Dataframe,modified type hints and added a functionality to let the user determine if they need the data in a csv file --- web_programming/fetch_amazon_product_data.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index 62062f759c29..50b33a9cddab 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -2,10 +2,8 @@ a product name as input from the user,and fetch the necessary information about that kind of products from Amazon like the product title,link to that product,price of the product,the ratings 
of -the product and the discount available on the product -in the form of a csv file,this will help the users by improving searchability -and navigability and find the right product easily and in a short period of time, -it will also be beneficial for performing better analysis on products""" +the product and the discount available on the product +and return it in the form of Pandas Dataframe""" import itertools as it @@ -15,7 +13,7 @@ from bs4 import BeautifulSoup as bs -def get_product_info(product: str = "laptop") -> None: +def get_product_info(product: str = "laptop",generate_csv: bool = False) -> pd.DataFrame: # function that will take the product as input and return the # product details as output # in the form of a csv file,if no input is given,it @@ -25,7 +23,8 @@ def get_product_info(product: str = "laptop") -> None: # generation of search query url ) header = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36\ + (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", "Accept-Language": "en-US, en;q=0.5", } # header that will indicate to the destination server that the # request is coming from a genuine human being and not a bot @@ -42,7 +41,7 @@ def get_product_info(product: str = "laptop") -> None: "Discount", ] ) # initializing a pandas dataframe to store the requisite information - for i, j in it.zip_longest( + for i,j in it.zip_longest( soup.find_all( "div", attrs={"class": "s-result-item", "data-component-type": "s-search-result"}, @@ -98,6 +97,7 @@ def get_product_info(product: str = "laptop") -> None: data["Current Price of the product"] > data["MRP of the product"], "Discount" ] = " " data.index += 1 - data.to_csv( - f"Amazon Product Data({product}).csv" - ) # writing the data to the csv file + if generate_csv == True: + data.to_csv(f"Amazon Product Data({product}).csv") + # writing the data to the csv file 
+ return data From 21acbd2af7d3acabce9dce01bca059722a81ee78 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 28 Oct 2022 04:30:09 +0000 Subject: [PATCH 08/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/fetch_amazon_product_data.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index 50b33a9cddab..10f82d7490e3 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -2,7 +2,7 @@ a product name as input from the user,and fetch the necessary information about that kind of products from Amazon like the product title,link to that product,price of the product,the ratings of -the product and the discount available on the product +the product and the discount available on the product and return it in the form of Pandas Dataframe""" @@ -13,7 +13,9 @@ from bs4 import BeautifulSoup as bs -def get_product_info(product: str = "laptop",generate_csv: bool = False) -> pd.DataFrame: +def get_product_info( + product: str = "laptop", generate_csv: bool = False +) -> pd.DataFrame: # function that will take the product as input and return the # product details as output # in the form of a csv file,if no input is given,it @@ -41,7 +43,7 @@ def get_product_info(product: str = "laptop",generate_csv: bool = False) -> pd.D "Discount", ] ) # initializing a pandas dataframe to store the requisite information - for i,j in it.zip_longest( + for i, j in it.zip_longest( soup.find_all( "div", attrs={"class": "s-result-item", "data-component-type": "s-search-result"}, @@ -98,6 +100,6 @@ def get_product_info(product: str = "laptop",generate_csv: bool = False) -> pd.D ] = " " data.index += 1 if generate_csv == True: - data.to_csv(f"Amazon Product Data({product}).csv") + 
data.to_csv(f"Amazon Product Data({product}).csv") # writing the data to the csv file return data From 43d2a6f8045c4d37e03b34a8f1195ad0c35ebf24 Mon Sep 17 00:00:00 2001 From: SparshRastogi <75373475+SparshRastogi@users.noreply.github.com> Date: Fri, 28 Oct 2022 18:17:17 +0530 Subject: [PATCH 09/13] Update fetch_amazon_product_data.py Made some bug fixes --- web_programming/fetch_amazon_product_data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index 10f82d7490e3..a9521ea31ae0 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -10,7 +10,7 @@ import pandas as pd import requests -from bs4 import BeautifulSoup as bs +from bs4 import BeautifulSoup def get_product_info( @@ -25,14 +25,14 @@ def get_product_info( # generation of search query url ) header = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36\ - (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36", + "User-Agent": """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 + (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36""", "Accept-Language": "en-US, en;q=0.5", } # header that will indicate to the destination server that the # request is coming from a genuine human being and not a bot page = requests.get(url, headers=header) page_content = page.text - soup = bs(page_content) + soup = BeautifulSoup(page_content) data = pd.DataFrame( columns=[ "Product Title", @@ -80,7 +80,7 @@ def get_product_info( * 100 ) except ValueError: - discount = "" + discount = float("nan") except AttributeError: pass data.loc[len(data.index)] = [ @@ -99,7 +99,7 @@ def get_product_info( data["Current Price of the product"] > data["MRP of the product"], "Discount" ] = " " data.index += 1 - if generate_csv == True: + if generate_csv: data.to_csv(f"Amazon Product Data({product}).csv") # writing the data to the csv file return 
data From a7f6d4a3778ce400f849cd3c6e2a0ee439a2b348 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 28 Oct 2022 12:48:23 +0000 Subject: [PATCH 10/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/fetch_amazon_product_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/fetch_amazon_product_data.py index a9521ea31ae0..309c1c840bb7 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/fetch_amazon_product_data.py @@ -10,7 +10,7 @@ import pandas as pd import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup def get_product_info( @@ -80,7 +80,7 @@ def get_product_info( * 100 ) except ValueError: - discount = float("nan") + discount = float("nan") except AttributeError: pass data.loc[len(data.index)] = [ From 6e2712f48c33bb0a073cfa387afdc9c8b3465879 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 28 Oct 2022 16:17:57 +0200 Subject: [PATCH 11/13] Update and rename fetch_amazon_product_data.py to get_amazon_product_data.py --- ...uct_data.py => get_amazon_product_data.py} | 78 +++++++++---------- 1 file changed, 36 insertions(+), 42 deletions(-) rename web_programming/{fetch_amazon_product_data.py => get_amazon_product_data.py} (50%) diff --git a/web_programming/fetch_amazon_product_data.py b/web_programming/get_amazon_product_data.py similarity index 50% rename from web_programming/fetch_amazon_product_data.py rename to web_programming/get_amazon_product_data.py index 309c1c840bb7..9c9659317632 100644 --- a/web_programming/fetch_amazon_product_data.py +++ b/web_programming/get_amazon_product_data.py @@ -1,39 +1,31 @@ -""" This file provides a function which will take -a product name as input from the user,and fetch the necessary -information about that kind of products from Amazon like 
"""
This file provides a function which will take a product name as input from the user,
and fetch from Amazon information about products of this name or category. The product
information will include title, URL, price, ratings, and the discount available.
"""

from itertools import zip_longest

import requests
from bs4 import BeautifulSoup
from pandas import DataFrame


def get_amazon_product_data(product: str = "laptop") -> DataFrame:
    """
    Take a product name or category as input and return product information from Amazon
    including title, URL, price, ratings, and the discount available.
    """

    def parse_price(price: str) -> float:
        """'₹1,234' -> 1234.0; NaN for empty or malformed text."""
        try:
            return float(price.strip("₹").replace(",", ""))
        except ValueError:
            return float("nan")

    url = f"https://www.amazon.in/laptop/s?k={product}"
    # Browser-like headers so Amazon serves the normal results page instead of
    # rejecting the request as automated traffic.
    header = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.5",
    }
    # Explicit parser avoids bs4's GuessedAtParserWarning; timeout prevents a
    # stalled connection from hanging the caller.
    soup = BeautifulSoup(
        requests.get(url, headers=header, timeout=10).text, "html.parser"
    )
    # Initialize a Pandas dataframe with the column titles
    data_frame = DataFrame(
        columns=[
            "Product Title",
            "Product Link",
            "Current Price of the product",
            "Product Rating",
            "MRP of the product",
            "Discount",
        ]
    )
    # Loop through each search-result entry and store it in the dataframe
    for item, _ in zip_longest(
        soup.find_all(
            "div",
            attrs={"class": "s-result-item", "data-component-type": "s-search-result"},
        ),
        soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}),
    ):
        try:
            product_title = item.h2.text
            # BUG FIX: the loop variable was renamed i -> item, but this line
            # still referenced `i`, raising NameError on every iteration.
            product_link = "https://www.amazon.in/" + item.h2.a["href"]
            product_price = item.find("span", attrs={"class": "a-offscreen"}).text
        except AttributeError:
            # A mandatory field is missing: skip this listing.  (A bare `pass`
            # here would append stale or undefined values to the dataframe.)
            continue
        try:
            product_rating = item.find("span", attrs={"class": "a-icon-alt"}).text
        except AttributeError:
            product_rating = "Not available"
        try:
            product_mrp = "₹" + item.find(
                "span", attrs={"class": "a-price a-text-price"}
            ).text.split("₹")[1]
        except AttributeError:
            product_mrp = ""
        mrp_value = parse_price(product_mrp)
        price_value = parse_price(product_price)
        if mrp_value == mrp_value and price_value == price_value and mrp_value:
            discount = (mrp_value - price_value) / mrp_value * 100
        else:
            discount = float("nan")
        data_frame.loc[len(data_frame.index)] = [
            product_title,
            product_link,
            product_price,
            product_rating,
            product_mrp,
            discount,
        ]
    # Blank out MRP and discount where the current price is not actually below
    # the MRP.  Compare the parsed numbers: comparing the raw '₹…' strings is
    # lexicographic and gives wrong answers (e.g. '₹999' > '₹1,000').
    overpriced = data_frame["Current Price of the product"].map(parse_price) > (
        data_frame["MRP of the product"].map(parse_price)
    )
    data_frame.loc[overpriced, ["MRP of the product", "Discount"]] = " "
    data_frame.index += 1  # 1-based row numbers
    return data_frame


if __name__ == "__main__":
    product = "headphones"
    get_amazon_product_data(product).to_csv(f"Amazon Product Data for {product}.csv")
" data_frame.index += 1 return data_frame From e7ca4c250d1eab716d91c0e17d44624184ec1d72 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 28 Oct 2022 16:21:14 +0200 Subject: [PATCH 13/13] Update get_amazon_product_data.py --- web_programming/get_amazon_product_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/web_programming/get_amazon_product_data.py b/web_programming/get_amazon_product_data.py index 248652e64194..c796793f2205 100644 --- a/web_programming/get_amazon_product_data.py +++ b/web_programming/get_amazon_product_data.py @@ -1,7 +1,7 @@ """ This file provides a function which will take a product name as input from the user, and fetch from Amazon information about products of this name or category. The product -information will inclued title, URL, price, ratings, and the discount available. +information will include title, URL, price, ratings, and the discount available. """ @@ -45,7 +45,7 @@ def get_amazon_product_data(product: str = "laptop") -> DataFrame: ): try: product_title = item.h2.text - product_link = "https://www.amazon.in/" + i.h2.a["href"] + product_link = "https://www.amazon.in/" + item.h2.a["href"] product_price = item.find("span", attrs={"class": "a-offscreen"}).text try: product_rating = item.find("span", attrs={"class": "a-icon-alt"}).text @@ -54,7 +54,7 @@ def get_amazon_product_data(product: str = "laptop") -> DataFrame: try: product_mrp = ( "₹" - + i.find( + + item.find( "span", attrs={"class": "a-price a-text-price"} ).text.split("₹")[1] )