refactor: update contest problem info extraction #156

Merged · 4 commits · Jun 10, 2025
6 changes: 4 additions & 2 deletions python/constants/code_templates.py
@@ -225,9 +225,11 @@ def solve(self, test_input=None):
 if __name__ == '__main__':
     import json

-    with open("input.json", "r") as f:
+    root_path = Path(__file__).parent
+
+    with (root_path / "input.json").open("r") as f:
         input_json = json.load(f)
-    with open("output.json", "r") as f:
+    with (root_path / "output.json").open("r") as f:
         output_json = json.load(f)
     sol = Solution()
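The template now resolves its fixture paths against the script's own directory instead of the process working directory. A quick sketch of why that matters, assuming `Path` is imported at the top of the generated file (the hunk above doesn't show the import) and a made-up directory layout:

```python
from pathlib import Path
import json

# Old template: open("input.json") resolves against the current working
# directory, so running `python some/dir/solution.py` from the repo root
# raises FileNotFoundError even though input.json sits next to the script.
# New template: anchor the lookup to the file's own location instead.
root_path = Path(__file__).parent

with (root_path / "input.json").open("r") as f:
    input_json = json.load(f)
```

Both forms behave identically when the script is launched from its own directory; the `Path`-based form is simply insensitive to where the interpreter was started.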
94 changes: 31 additions & 63 deletions python/lc_libs/contest.py
@@ -1,13 +1,13 @@
-import ast
 import json
 import logging
-import re
 from typing import List

 from bs4 import BeautifulSoup

 from python.constants import CONTEST_HISTORY_QUERY, LEET_CODE_BACKEND
+from python.lc_libs import extract_outputs_from_md
 from python.utils import general_request
+from python.utils.str_util import decode_unicode_string


 def get_contest_list(page_num: int = 1, page_size: int = 10):
@@ -51,79 +51,47 @@ def handle_response(response):
     return general_request(url, handle_response, "get")


-
 def get_contest_problem_info(contest_id: str, question_slug: str, languages: List[str], cookie: str):
     def handle_response(response):
         logging.debug(response.text)
         soup = BeautifulSoup(response.text, "html.parser")
-        code_info = soup.find("script", string=re.compile("var pageData ="))
+        code_info = soup.find("script", id="__NEXT_DATA__")
         if not code_info:
             logging.warning("Cookie might be expired! Please update the cookie and try again.")
             return None
         code_info_str = code_info.decode_contents()
-        en_title = None
-        cn_title = None
-        example_testcases = ""
-        sample_test_case = ""
-        code_definitions = None
-        for line in code_info_str.split("\n"):
-            if "questionSourceTitle" in line:
-                en_title = re.search(r"questionSourceTitle: '(.*?)'", line).group(1)
-                continue
-            if "questionTitle" in line:
-                cn_title = line.split("'")[-2]
-                continue
-            if "questionExampleTestcases" in line:
-                qet = re.search(r"questionExampleTestcases: '(.*)'", line).group(1)
-                decoded_str = qet.encode('latin-1').decode('unicode_escape')
-                example_testcases = decoded_str
-                continue
-            if "sampleTestCase" in line:
-                sample_test_case = re.search(r"sampleTestCase: '(.*)'", line).group(1)
-                decoded_str = sample_test_case.encode('latin-1').decode('unicode_escape')
-                sample_test_case = decoded_str
-                continue
-            if "codeDefinition" in line:
-                code_definitions = line.split(":", 1)[1].rsplit(",", 1)[0]
-                # """ in decoded_str
-                code_definitions = ast.literal_eval(code_definitions)
-                continue
-        input_vars = sample_test_case.count("\n") + 1
+        code_info_json = json.loads(code_info_str)
+        question_json = code_info_json["props"]["pageProps"]["dehydratedState"]["queries"][1]["state"]["data"]["contestQuestion"]["question"]
+        question_id = question_json["questionFrontendId"]
+        en_title = question_json["title"]
+        cn_title = question_json["translatedTitle"]
+
+        en_markdown = decode_unicode_string(question_json["content"])
+        en_markdown_content = f"# {question_id}. {en_title}\n\n{en_markdown}"
+        cn_markdown = decode_unicode_string(question_json["translatedContent"])
+        cn_markdown_content = f"# {question_id}. {cn_title}\n\n{cn_markdown}"

+        example_testcase_list = question_json["exampleTestcaseList"]
         question_example_testcases = []
-        splits = example_testcases.split("\n")
-        for inputs in range(0, len(splits), input_vars):
-            cur_inputs = []
-            for i in range(inputs, inputs + input_vars):
-                cur_inputs.append(json.loads(splits[i]))
-            question_example_testcases.append(cur_inputs)
+        for example_testcase_str in example_testcase_list:
+            lt = example_testcase_str.split("\n")
+            cur = []
+            for part in lt:
+                cur.append(json.loads(part))
+            question_example_testcases.append(cur)

+        example_outputs = extract_outputs_from_md(en_markdown_content)
+
+        code_snippets = question_json["codeSnippets"]
         language_default_code = {}
-        for code_definition in code_definitions:
-            if code_definition.get("value") not in languages:
-                continue
-            language_default_code[code_definition.get("value")] = code_definition.get("defaultCode")
-
-        title = soup.find("h3")
-        question_id = title.text.split(".")[0]
-
-        cn_question_content = soup.find("div", class_="question-content default-content")
-        if cn_question_content:
-            cn_markdown_content = f"# {question_id}. {cn_title}\n\n{cn_question_content.decode_contents()}"
-        else:
-            logging.warning("No CN content found for %s", question_slug)
-            cn_markdown_content = None
-        en_question_content = soup.find("div", class_="question-content source-content")
-        if en_question_content:
-            en_markdown_content = f"# {question_id}. {en_title}\n\n{en_question_content.decode_contents()}"
-        else:
-            logging.warning("No EN content found for %s", question_slug)
-            en_markdown_content = None
-        outputs = cn_question_content.find_all("span", class_="example-io")
-        example_outputs = []
-        for output in outputs[1::2]:
-            example_outputs.append(json.loads(output.text))
+        for code_snippet in code_snippets:
+            if code_snippet["langSlug"] in languages:
+                language_default_code[code_snippet["langSlug"]] = code_snippet["code"]

         return {
             "question_id": question_id,
-            "title": title.text,
+            "title": en_title,
             "question_slug": question_slug,
             "en_markdown_content": en_markdown_content,
             "cn_markdown_content": cn_markdown_content,
@@ -132,5 +100,5 @@ def handle_response(response):
"language_default_code": language_default_code
}

url = f"https://leetcode.cn/contest/{contest_id}/problems/{question_slug}/"
url = f"https://leetcode.cn/contest/{contest_id}/problems/{question_slug}/description"
return general_request(url, handle_response, "get", cookies={"cookie": cookie})
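The rewrite swaps line-by-line regex scraping of the legacy `var pageData =` script for the JSON blob Next.js embeds in the `__NEXT_DATA__` script tag, which is both simpler and less brittle. A minimal standalone sketch of the same extraction path; the nested keys come from the diff, while the sample HTML is fabricated for illustration:

```python
import json

from bs4 import BeautifulSoup

# Fabricated stand-in for a contest problem page; the real __NEXT_DATA__
# payload is far larger, but the nested path below matches the diff.
html = """
<script id="__NEXT_DATA__" type="application/json">
{"props": {"pageProps": {"dehydratedState": {"queries": [
  {},
  {"state": {"data": {"contestQuestion": {"question": {
    "questionFrontendId": "9999",
    "title": "Sample Problem",
    "translatedTitle": "\\u793a\\u4f8b",
    "exampleTestcaseList": ["[1,2,3]\\n5"]
  }}}}}
]}}}}
</script>
"""

soup = BeautifulSoup(html, "html.parser")
tag = soup.find("script", id="__NEXT_DATA__")
data = json.loads(tag.decode_contents())

# queries[1] holds the contest question on the pages this PR targets;
# that index is an observation from the diff, not a documented contract.
question = (data["props"]["pageProps"]["dehydratedState"]
            ["queries"][1]["state"]["data"]["contestQuestion"]["question"])
print(question["questionFrontendId"], question["title"])  # 9999 Sample Problem

# Each exampleTestcaseList entry packs one argument per line, so a
# two-argument case "[1,2,3]\n5" parses to [[1, 2, 3], 5].
case = [json.loads(part) for part in question["exampleTestcaseList"][0].split("\n")]
print(case)  # [[1, 2, 3], 5]
```

Because every testcase arrives as its own newline-separated string, the new code no longer needs to infer the argument count from `sampleTestCase` before chunking a flat list, which is what the deleted `input_vars` arithmetic did.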
15 changes: 15 additions & 0 deletions python/utils/str_util.py
@@ -1,3 +1,6 @@
+import re
+
+
 def format_question_id(question_id: str) -> str:
     if not question_id:
         return question_id
@@ -27,3 +30,15 @@ def back_question_id(question_id: str) -> str:
if "Interview" in question_id:
question_id = question_id.replace("Interview", "面试题")
return question_id



def decode_unicode_string(s: str) -> str:
# Use re.sub to find all occurrences of r'\uXXXX'
# and replace them with the corresponding Unicode character.
# Chinese characters and other text will remain unchanged.
s_decoded = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), s)

# The comment "s is js encoded, decode it" implies that tmp.md contains
# literal \uXXXX sequences, which this approach handles.
return s_decoded.replace("\\n", "\n").replace("\\t", "\t")
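For reference, a quick check of what the new helper does, using a made-up JavaScript-escaped sample string:

```python
from python.utils.str_util import decode_unicode_string

# Raw string, so this holds literal \uXXXX and \n escape sequences,
# mimicking what the scraped payload looks like before decoding.
raw = r"\u793a\u4f8b\u95ee\u9898\nline two"

print(decode_unicode_string(raw))
# 示例问题
# line two
```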