diff --git a/python/constants/code_templates.py b/python/constants/code_templates.py
index 2ca196bf3..9b2b9bf5d 100644
--- a/python/constants/code_templates.py
+++ b/python/constants/code_templates.py
@@ -225,9 +225,11 @@ def solve(self, test_input=None):
 
 if __name__ == '__main__':
     import json
-    with open("input.json", "r") as f:
+    root_path = Path(__file__).parent
+
+    with (root_path / "input.json").open("r") as f:
         input_json = json.load(f)
-    with open("output.json", "r") as f:
+    with (root_path / "output.json").open("r") as f:
         output_json = json.load(f)
 
     sol = Solution()
diff --git a/python/lc_libs/contest.py b/python/lc_libs/contest.py
index 9457c6be3..8d5ce76d0 100644
--- a/python/lc_libs/contest.py
+++ b/python/lc_libs/contest.py
@@ -1,13 +1,13 @@
-import ast
 import json
 import logging
-import re
 from typing import List
 
 from bs4 import BeautifulSoup
 
 from python.constants import CONTEST_HISTORY_QUERY, LEET_CODE_BACKEND
+from python.lc_libs import extract_outputs_from_md
 from python.utils import general_request
+from python.utils.str_util import decode_unicode_string
 
 
 def get_contest_list(page_num: int = 1, page_size: int = 10):
@@ -51,79 +51,47 @@ def handle_response(response):
 
     return general_request(url, handle_response, "get")
 
+
 def get_contest_problem_info(contest_id: str, question_slug: str, languages: List[str], cookie: str):
     def handle_response(response):
         logging.debug(response.text)
         soup = BeautifulSoup(response.text, "html.parser")
-        code_info = soup.find("script", string=re.compile("var pageData ="))
+        code_info = soup.find("script", id="__NEXT_DATA__")
         if not code_info:
             logging.warning("Cookie might be expired! Please update the cookie and try again.")
             return None
         code_info_str = code_info.decode_contents()
 
-        en_title = None
-        cn_title = None
-        example_testcases = ""
-        sample_test_case = ""
-        code_definitions = None
-        for line in code_info_str.split("\n"):
-            if "questionSourceTitle" in line:
-                en_title = re.search(r"questionSourceTitle: '(.*?)'", line).group(1)
-                continue
-            if "questionTitle" in line:
-                cn_title = line.split("'")[-2]
-                continue
-            if "questionExampleTestcases" in line:
-                qet = re.search(r"questionExampleTestcases: '(.*)'", line).group(1)
-                decoded_str = qet.encode('latin-1').decode('unicode_escape')
-                example_testcases = decoded_str
-                continue
-            if "sampleTestCase" in line:
-                sample_test_case = re.search(r"sampleTestCase: '(.*)'", line).group(1)
-                decoded_str = sample_test_case.encode('latin-1').decode('unicode_escape')
-                sample_test_case = decoded_str
-                continue
-            if "codeDefinition" in line:
-                code_definitions = line.split(":", 1)[1].rsplit(",", 1)[0]
-                # """ in decoded_str
-                code_definitions = ast.literal_eval(code_definitions)
-                continue
-        input_vars = sample_test_case.count("\n") + 1
+        code_info_json = json.loads(code_info_str)
+        question_json = code_info_json["props"]["pageProps"]["dehydratedState"]["queries"][1]["state"]["data"]["contestQuestion"]["question"]
+        question_id = question_json["questionFrontendId"]
+        en_title = question_json["title"]
+        cn_title = question_json["translatedTitle"]
+
+        en_markdown = decode_unicode_string(question_json["content"])
+        en_markdown_content = f"# {question_id}. {en_title}\n\n{en_markdown}"
+        cn_markdown = decode_unicode_string(question_json["translatedContent"])
+        cn_markdown_content = f"# {question_id}. {cn_title}\n\n{cn_markdown}"
+
+        example_testcase_list = question_json["exampleTestcaseList"]
         question_example_testcases = []
-        splits = example_testcases.split("\n")
-        for inputs in range(0, len(splits), input_vars):
-            cur_inputs = []
-            for i in range(inputs, inputs + input_vars):
-                cur_inputs.append(json.loads(splits[i]))
-            question_example_testcases.append(cur_inputs)
+        for example_testcase_str in example_testcase_list:
+            lt = example_testcase_str.split("\n")
+            cur = []
+            for part in lt:
+                cur.append(json.loads(part))
+            question_example_testcases.append(cur)
+
+        example_outputs = extract_outputs_from_md(en_markdown_content)
+        code_snippets = question_json["codeSnippets"]
         language_default_code = {}
-        for code_definition in code_definitions:
-            if code_definition.get("value") not in languages:
-                continue
-            language_default_code[code_definition.get("value")] = code_definition.get("defaultCode")
-
-        title = soup.find("h3")
-        question_id = title.text.split(".")[0]
-
-        cn_question_content = soup.find("div", class_="question-content default-content")
-        if cn_question_content:
-            cn_markdown_content = f"# {question_id}. {cn_title}\n\n{cn_question_content.decode_contents()}"
-        else:
-            logging.warning("No CN content found for %s", question_slug)
-            cn_markdown_content = None
-        en_question_content = soup.find("div", class_="question-content source-content")
-        if en_question_content:
-            en_markdown_content = f"# {question_id}. {en_title}\n\n{en_question_content.decode_contents()}"
-        else:
-            logging.warning("No EN content found for %s", question_slug)
-            en_markdown_content = None
-        outputs = cn_question_content.find_all("span", class_="example-io")
-        example_outputs = []
-        for output in outputs[1::2]:
-            example_outputs.append(json.loads(output.text))
+        for code_snippet in code_snippets:
+            if code_snippet["langSlug"] in languages:
+                language_default_code[code_snippet["langSlug"]] = code_snippet["code"]
+
         return {
             "question_id": question_id,
-            "title": title.text,
+            "title": en_title,
             "question_slug": question_slug,
             "en_markdown_content": en_markdown_content,
             "cn_markdown_content": cn_markdown_content,
@@ -132,5 +100,5 @@ def handle_response(response):
             "language_default_code": language_default_code
         }
 
-    url = f"https://leetcode.cn/contest/{contest_id}/problems/{question_slug}/"
+    url = f"https://leetcode.cn/contest/{contest_id}/problems/{question_slug}/description"
     return general_request(url, handle_response, "get", cookies={"cookie": cookie})
diff --git a/python/utils/str_util.py b/python/utils/str_util.py
index a143547c6..cb5e7dd2a 100644
--- a/python/utils/str_util.py
+++ b/python/utils/str_util.py
@@ -1,3 +1,6 @@
+import re
+
+
 def format_question_id(question_id: str) -> str:
     if not question_id:
         return question_id
@@ -27,3 +30,15 @@ def back_question_id(question_id: str) -> str:
     if "Interview" in question_id:
         question_id = question_id.replace("Interview", "面试题")
     return question_id
+
+
+def decode_unicode_string(s: str) -> str:
+    # The scraped payload is JavaScript-encoded, so it still carries literal
+    # \uXXXX escape sequences after json.loads. Replace each one with the
+    # corresponding Unicode character (e.g. \u9898 -> 题); already-decoded
+    # Chinese characters and other text pass through unchanged.
+    s_decoded = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), s)
+
+    # Also unescape the literal \n and \t sequences left over from the
+    # JavaScript string encoding.
+    return s_decoded.replace("\\n", "\n").replace("\\t", "\t")