27 changes: 27 additions & 0 deletions WebScrapingScripts/StackOverflow Question Scraper/README.md
@@ -0,0 +1,27 @@
# StackOverflow Question Scraper

## Aim

The main aim of this project is to scrape the top 50 questions for a tag from StackOverflow and store them in a serialized format such as a JSON file.

## Purpose

The purpose of the project is to give users a quick way to see the top questions for a given tag.

## Setup instructions

- The script uses BeautifulSoup to scrape content from the website.
- To avoid version conflicts, run `pip install -r requirements.txt` in your terminal.
- After installing the dependencies, run `python scrape.py`.
- Enter the tag you want to scrape and the filter, and you are good to go. A sketch of reading the exported file is shown below.
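
Once the script finishes, the questions are saved to `questions-<tag>.json` in the working directory. As a quick sanity check, here is a minimal sketch (assuming the `python` tag was scraped) of loading the exported file and printing a few entries:

```python
import json

# questions-python.json is produced by scrape.py when the "python" tag is entered
with open("questions-python.json") as f:
    questions = json.load(f)

# each entry contains the votes, title, link and date collected by the scraper
for q in questions[:3]:
    print(q["votes"], "votes -", q["title"])
    print("   ", q["link"], "|", q["date"])
```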


## Output

![](./images/execution.png)
<br/><br/><br/>
![](./images/ouput.png)

## Author

[Vivek Kumar Singh](https://github.com/vivekthedev)
8 changes: 8 additions & 0 deletions WebScrapingScripts/StackOverflow Question Scraper/requirements.txt
@@ -0,0 +1,8 @@
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.9.24
charset-normalizer==2.1.1
idna==3.4
requests==2.28.1
soupsieve==2.3.2.post1
urllib3==1.26.12
65 changes: 65 additions & 0 deletions WebScrapingScripts/StackOverflow Question Scraper/scrape.py
@@ -0,0 +1,65 @@
from bs4 import BeautifulSoup
import requests
import json


# pagesize=50 matches the 50 questions mentioned in the README
fmt = "https://stackoverflow.com/questions/tagged/{tag}?tab={filter}&pagesize=50"
filters = [
    "1. Newest",
    "2. Active",
    "3. Bounties",
    "4. Unanswered",
    "5. Frequent",
    "6. Votes",
]

tag = input("enter any question tag (python, java)\n")
print("\n".join(filters))
try:
    choice = int(input("enter the filter number (1, 3, 5)\n"))
    # menu numbers are 1-based, so shift down to index the filters list
    filter = filters[choice - 1].split(" ")[-1]
except (ValueError, IndexError):
    # fall back to the default filter on bad input
    filter = "Votes"

# generate dynamic URL with user preferences
URL = fmt.format(tag=tag, filter=filter)

print("generated URL:", URL)
content = requests.get(URL, timeout=30).content

# html.parser ships with Python, so no extra parser package is required
soup = BeautifulSoup(content, "html.parser")

# return only question tags
def is_question(tag):
    # question summaries carry ids like "question-summary-12345678"
    tag_id = tag.get("id")
    return bool(tag_id) and tag_id.startswith("question-summary-")


questions = soup.find_all(is_question)
question_data = []
if questions:
    # extract question data like votes, title, link and date
    for question in questions:
        question_dict = {}
        question_dict["votes"] = (
            question.find(class_="s-post-summary--stats-item-number").get_text().strip()
        )
        h3 = question.find(class_="s-post-summary--content-title")
        question_dict["title"] = h3.get_text().strip()
        question_dict["link"] = "https://stackoverflow.com" + h3.find("a").get("href")
        question_dict["date"] = (
            question.find(class_="s-user-card--time").span.get_text().strip()
        )
        question_data.append(question_dict)

    with open(f"questions-{tag}.json", "w") as f:
        json.dump(question_data, f)

    print("file exported")
else:
    print(URL)
    print("looks like there are no questions matching your tag ", tag)