From bffa2994275cc52f5c69c0f0eb8379955bcc7340 Mon Sep 17 00:00:00 2001 From: qubh Date: Tue, 10 Jun 2025 14:49:05 +0800 Subject: [PATCH 1/4] refactor: update contest problem info extraction and improve unicode decoding new contest fit --- python/lc_libs/contest.py | 101 +++++++++++++++----------------------- python/utils/str_util.py | 15 ++++++ 2 files changed, 54 insertions(+), 62 deletions(-) diff --git a/python/lc_libs/contest.py b/python/lc_libs/contest.py index 9457c6be3..ab7e39d39 100644 --- a/python/lc_libs/contest.py +++ b/python/lc_libs/contest.py @@ -1,13 +1,19 @@ import ast import json import logging +from os import environ +import os import re from typing import List +from pathlib import Path from bs4 import BeautifulSoup +from dotenv import load_dotenv -from python.constants import CONTEST_HISTORY_QUERY, LEET_CODE_BACKEND +from python.constants import CONTEST_HISTORY_QUERY, LEET_CODE_BACKEND, constant +from python.lc_libs import extract_outputs_from_md from python.utils import general_request +from python.utils.str_util import decode_unicode_string def get_contest_list(page_num: int = 1, page_size: int = 10): @@ -51,79 +57,50 @@ def handle_response(response): return general_request(url, handle_response, "get") + def get_contest_problem_info(contest_id: str, question_slug: str, languages: List[str], cookie: str): def handle_response(response): logging.debug(response.text) soup = BeautifulSoup(response.text, "html.parser") - code_info = soup.find("script", string=re.compile("var pageData =")) + code_info = soup.find("script", id="__NEXT_DATA__") if not code_info: logging.warning("Cookie might be expired! Please update the cookie and try again.") return None code_info_str = code_info.decode_contents() - en_title = None - cn_title = None - example_testcases = "" - sample_test_case = "" - code_definitions = None - for line in code_info_str.split("\n"): - if "questionSourceTitle" in line: - en_title = re.search(r"questionSourceTitle: '(.*?)'", line).group(1) - continue - if "questionTitle" in line: - cn_title = line.split("'")[-2] - continue - if "questionExampleTestcases" in line: - qet = re.search(r"questionExampleTestcases: '(.*)'", line).group(1) - decoded_str = qet.encode('latin-1').decode('unicode_escape') - example_testcases = decoded_str - continue - if "sampleTestCase" in line: - sample_test_case = re.search(r"sampleTestCase: '(.*)'", line).group(1) - decoded_str = sample_test_case.encode('latin-1').decode('unicode_escape') - sample_test_case = decoded_str - continue - if "codeDefinition" in line: - code_definitions = line.split(":", 1)[1].rsplit(",", 1)[0] - # """ in decoded_str - code_definitions = ast.literal_eval(code_definitions) - continue - input_vars = sample_test_case.count("\n") + 1 + code_info_json = json.loads(code_info_str) + question_json = code_info_json["props"]["pageProps"]["dehydratedState"]["queries"][1]["state"]["data"]["contestQuestion"]["question"] + question_id = question_json["questionFrontendId"] + en_title = question_json["title"] + cn_title = question_json["translatedTitle"] + + en_markdown = decode_unicode_string(question_json["content"]) + en_markdown_content = f"# {question_id}. {en_title}\n\n{en_markdown}" + cn_markdown = decode_unicode_string(question_json["translatedContent"]) + cn_markdown_content = f"# {question_id}. {cn_title}\n\n{cn_markdown}" + + example_testcase_list = question_json["exampleTestcaseList"] question_example_testcases = [] - splits = example_testcases.split("\n") - for inputs in range(0, len(splits), input_vars): - cur_inputs = [] - for i in range(inputs, inputs + input_vars): - cur_inputs.append(json.loads(splits[i])) - question_example_testcases.append(cur_inputs) + for example_testcase_str in example_testcase_list: + lt = example_testcase_str.split("\n") + if len (lt) == 1: + question_example_testcases.append(json.loads(lt[0])) + else: + cur = [] + for part in lt: + cur.append(json.loads(part)) + question_example_testcases.append(cur) + + example_outputs = extract_outputs_from_md(en_markdown_content) + code_snippets = question_json["codeSnippets"] language_default_code = {} - for code_definition in code_definitions: - if code_definition.get("value") not in languages: - continue - language_default_code[code_definition.get("value")] = code_definition.get("defaultCode") - - title = soup.find("h3") - question_id = title.text.split(".")[0] - - cn_question_content = soup.find("div", class_="question-content default-content") - if cn_question_content: - cn_markdown_content = f"# {question_id}. {cn_title}\n\n{cn_question_content.decode_contents()}" - else: - logging.warning("No CN content found for %s", question_slug) - cn_markdown_content = None - en_question_content = soup.find("div", class_="question-content source-content") - if en_question_content: - en_markdown_content = f"# {question_id}. {en_title}\n\n{en_question_content.decode_contents()}" - else: - logging.warning("No EN content found for %s", question_slug) - en_markdown_content = None - outputs = cn_question_content.find_all("span", class_="example-io") - example_outputs = [] - for output in outputs[1::2]: - example_outputs.append(json.loads(output.text)) + for code_snippet in code_snippets: + if code_snippet["langSlug"] in languages: + language_default_code[code_snippet["langSlug"]] = code_snippet["code"] + return { "question_id": question_id, - "title": title.text, + "title": en_title, "question_slug": question_slug, "en_markdown_content": en_markdown_content, "cn_markdown_content": cn_markdown_content, @@ -132,5 +109,5 @@ def handle_response(response): "language_default_code": language_default_code } - url = f"https://leetcode.cn/contest/{contest_id}/problems/{question_slug}/" + url = f"https://leetcode.cn/contest/{contest_id}/problems/{question_slug}/description" return general_request(url, handle_response, "get", cookies={"cookie": cookie}) diff --git a/python/utils/str_util.py b/python/utils/str_util.py index a143547c6..cb5e7dd2a 100644 --- a/python/utils/str_util.py +++ b/python/utils/str_util.py @@ -1,3 +1,6 @@ +import re + + def format_question_id(question_id: str) -> str: if not question_id: return question_id @@ -27,3 +30,15 @@ def back_question_id(question_id: str) -> str: if "Interview" in question_id: question_id = question_id.replace("Interview", "面试题") return question_id + + + +def decode_unicode_string(s: str) -> str: + # Use re.sub to find all occurrences of r'\uXXXX' + # and replace them with the corresponding Unicode character. + # Chinese characters and other text will remain unchanged. + s_decoded = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), s) + + # The comment "s is js encoded, decode it" implies that tmp.md contains + # literal \uXXXX sequences, which this approach handles. + return s_decoded.replace("\\n", "\n").replace("\\t", "\t") From f6567755a57ff32a458c224785acbaef329d16c2 Mon Sep 17 00:00:00 2001 From: qubh Date: Tue, 10 Jun 2025 14:50:12 +0800 Subject: [PATCH 2/4] fix: clean import --- python/lc_libs/contest.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/python/lc_libs/contest.py b/python/lc_libs/contest.py index ab7e39d39..f9fe2f355 100644 --- a/python/lc_libs/contest.py +++ b/python/lc_libs/contest.py @@ -1,16 +1,10 @@ -import ast import json import logging -from os import environ -import os -import re from typing import List -from pathlib import Path from bs4 import BeautifulSoup -from dotenv import load_dotenv -from python.constants import CONTEST_HISTORY_QUERY, LEET_CODE_BACKEND, constant +from python.constants import CONTEST_HISTORY_QUERY, LEET_CODE_BACKEND from python.lc_libs import extract_outputs_from_md from python.utils import general_request from python.utils.str_util import decode_unicode_string From 1075b98dbaf278a5a4ddbde7ed7399b31279372f Mon Sep 17 00:00:00 2001 From: qubh Date: Tue, 10 Jun 2025 15:01:06 +0800 Subject: [PATCH 3/4] fix: code template contest py input path --- python/constants/code_templates.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/constants/code_templates.py b/python/constants/code_templates.py index 2ca196bf3..9b2b9bf5d 100644 --- a/python/constants/code_templates.py +++ b/python/constants/code_templates.py @@ -225,9 +225,11 @@ def solve(self, test_input=None): if __name__ == '__main__': import json - with open("input.json", "r") as f: + root_path = Path(__file__).parent + + with (root_path / "input.json").open("r") as f: input_json = json.load(f) - with open("output.json", "r") as f: + with (root_path / "output.json").open("r") as f: output_json = json.load(f) sol = Solution() From 11aa495144879effc92eabd7e576e149e7f10a1a Mon Sep 17 00:00:00 2001 From: qubh Date: Tue, 10 Jun 2025 15:12:45 +0800 Subject: [PATCH 4/4] fix: code review --- python/lc_libs/contest.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/lc_libs/contest.py b/python/lc_libs/contest.py index f9fe2f355..8d5ce76d0 100644 --- a/python/lc_libs/contest.py +++ b/python/lc_libs/contest.py @@ -76,13 +76,10 @@ def handle_response(response): question_example_testcases = [] for example_testcase_str in example_testcase_list: lt = example_testcase_str.split("\n") - if len (lt) == 1: - question_example_testcases.append(json.loads(lt[0])) - else: - cur = [] - for part in lt: - cur.append(json.loads(part)) - question_example_testcases.append(cur) + cur = [] + for part in lt: + cur.append(json.loads(part)) + question_example_testcases.append(cur) example_outputs = extract_outputs_from_md(en_markdown_content)