+# Comments:
+# - The first run without skipping the cache will take a while, but subsequent runs are much faster
+# - Parsing functions takes much longer than parsing events, because there are far more of them
+
 import requests
 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 import yaml
 
+import time
 import os
 import shutil
 
-# 🌐 URL constants
+# Cache of event/function Wiki pages
+SKIP_CACHE = False  # Set to True to skip the cache and always fetch fresh pages
+PAGES_CACHE_DIR = "./cache/pages"
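+# Illustrative cache layout under PAGES_CACHE_DIR (hypothetical page names):
+#   ./cache/pages/onClientRender.html
+#   ./cache/pages/getElementPosition.html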
+
+# Function listing URLs
 URL_CLIENT_FUNCS = "https://wiki.multitheftauto.com/wiki/Client_Scripting_Functions"
 URL_SERVER_FUNCS = "https://wiki.multitheftauto.com/wiki/Server_Scripting_Functions"
 URL_SHARED_FUNCS = "https://wiki.multitheftauto.com/wiki/Shared_Scripting_Functions"
 
+# Event listing URLs
 URL_CLIENT_EVENTS = "https://wiki.multitheftauto.com/wiki/Client_Scripting_Events"
 URL_SERVER_EVENTS = "https://wiki.multitheftauto.com/wiki/Server_Scripting_Events"
 
+# Output directories
 FUNCTIONS_DIR = "./output/functions"
 EVENTS_DIR = "./output/events"
 
+# Rename some categories
 CATEGORY_CORRECTIONS = {
     'SQL': 'Database',
     'Collision_shape': 'Colshape',
 }
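+# e.g. pages listed under the wiki's 'SQL' category end up in a 'Database' output folder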
 
+# Exclude these items from the listings
 NAME_BLACKLIST = [
     'Matrix',
     'Vector'
@@ -41,7 +54,7 @@ def fix_category(category_name: str) -> str:
     return category_name
 
 def parse_links(source_label: str, url: str) -> dict:
-    print(f"Parsing list of {source_label} from {url}...")
+    print(f"Parsing list of {source_label}...")
 
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "html.parser")
@@ -279,11 +292,27 @@ def parse_description(content_div):
             break
 
     return the_description
-
+
+def get_page_from_cache_or_fetch(page_url: str, page_name: str) -> str:
+    """Get the page content from cache or fetch it if not cached."""
+    cache_file = os.path.join(PAGES_CACHE_DIR, f"{page_name}.html")
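+    # Note: page_name is used verbatim as the cache filename, so it is assumed to be filesystem-safe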
+    if (not SKIP_CACHE) and os.path.exists(cache_file):
+        with open(cache_file, "r", encoding="utf-8") as f:
+            return f.read()
+    else:
+        # Fetch and cache the page
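+        # (non-200 responses are not cached, so failed pages are retried on the next run)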
+        response = requests.get(page_url)
+        if response.status_code == 200:
+            with open(cache_file, "w", encoding="utf-8") as f:
+                f.write(response.text)
+            return response.text
+        else:
+            raise ValueError(f"Failed to fetch {page_url}: {response.status_code}")
 
 def parse_event_page(page_url: str, category: str, name: str, source: str) -> dict:
-    response = requests.get(page_url)
-    soup = BeautifulSoup(response.text, "html.parser")
+    response_text = get_page_from_cache_or_fetch(page_url, name)
+
+    soup = BeautifulSoup(response_text, "html.parser")
 
     # Find first non-empty p inside mw-content-text
     content_div = soup.find("div", id="mw-content-text")
@@ -391,7 +420,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
     # Examples
     examples = parse_examples(content_div)
     if len(examples) == 0:
-        print(f"Found no examples for {name}")
+        print(f"Event is missing code examples: {page_url}")
 
     # For each example, create a .lua file with the code
     # with name eventName-index.lua
@@ -409,7 +438,7 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
             added_examples.append({
                 "path": 'examples/' + example_filename,
                 "description": example_description,
-                "side": example.get("type") or event_type  # Default to event_type if not specified
+                "side": example.get("type") or event_type  # Fall back to event_type if the example doesn't specify a side
             })
             example_index += 1
 
@@ -437,8 +466,9 @@ def parse_event_page(page_url: str, category: str, name: str, source: str) -> di
     return yaml_dict
 
 def parse_function_page(page_url: str, category: str, name: str, source: str) -> dict:
-    response = requests.get(page_url)
-    soup = BeautifulSoup(response.text, "html.parser")
+    response_text = get_page_from_cache_or_fetch(page_url, name)
+
+    soup = BeautifulSoup(response_text, "html.parser")
     content_div = soup.find("div", id="mw-content-text")
     if not content_div:
         raise ValueError(f"Could not find content in {page_url}")
@@ -450,13 +480,41 @@ def parse_function_page(page_url: str, category: str, name: str, source: str) ->
         raise ValueError(f"Could not find a valid description for {name} in {page_url}")
 
     func_notes, func_meta = parse_notes(content_div)
+
+    # Examples
+    examples = parse_examples(content_div)
+    # if len(examples) == 0:
+    #     print(f"Function is missing code examples: {page_url}")
+
+    # For each example, create a .lua file with the code
+    # with name functionName-index.lua
+    example_index = 1
+    added_examples = []
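+    # Each entry appended below ends up like (illustrative values):
+    #   {"path": "examples/getElementPosition-1.lua", "description": "...", "side": "shared"}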
+    for example in examples:
+        example_code = example.get("code", "").strip()
+        if example_code:
+            example_filename = f"{name}-{example_index}.lua"
+            example_path = os.path.join(FUNCTIONS_DIR, category, 'examples', example_filename)
+            os.makedirs(os.path.dirname(example_path), exist_ok=True)
+            with open(example_path, "w", encoding="utf-8") as example_file:
+                example_file.write(example_code)
+            example_description = example.get("description", "").strip()
+            added_examples.append({
+                "path": 'examples/' + example_filename,
+                "description": example_description,
+                "side": example.get("type") or func_type  # Fall back to func_type if the example doesn't specify a side
+            })
+            example_index += 1
+
 
     yaml_dict = {
         func_type: {
             "name": name,
             "description": func_description,
             "parameters": [],
-            "examples": [],
+            "examples": added_examples,
             "notes": func_notes,
             "meta": func_meta
         }
@@ -503,6 +561,7 @@ def convert_page_to_yaml(page_url: str, category: str, name: str, source: str) -
 
 def parse_items_by_source(base_dir, data_by_source):
     for source, categories in data_by_source.items():
+        started_at = time.time()
         print(f"Parsing individual pages of {source}...")
         for category, entries in categories.items():
             dir_path = os.path.join(base_dir, category)
@@ -522,16 +581,21 @@ def parse_items_by_source(base_dir, data_by_source):
             if os.path.exists(filename):
                 os.remove(filename)
 
-        print(f"YAML & Lua files for {source} written successfully to {base_dir}.")
+        print(f">> Parsed individual pages of {source} in {time.time() - started_at:.2f} seconds.")
 
 def main():
+    # Create cache directory if it doesn't exist
+    if not os.path.exists(PAGES_CACHE_DIR):
+        os.makedirs(PAGES_CACHE_DIR)
+    print("SKIP_CACHE is set to", SKIP_CACHE)
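+    # Note: the cache persists across runs; only ./output is emptied below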
+
     functions_by_source = {}
     events_by_source = {}
 
     # Functions
-    # functions_by_source["Shared functions"] = parse_links("Shared functions", URL_SHARED_FUNCS)
-    # functions_by_source["Client functions"] = parse_links("Client functions", URL_CLIENT_FUNCS)
-    # functions_by_source["Server functions"] = parse_links("Server functions", URL_SERVER_FUNCS)
+    functions_by_source["Shared functions"] = parse_links("Shared functions", URL_SHARED_FUNCS)
+    functions_by_source["Client functions"] = parse_links("Client functions", URL_CLIENT_FUNCS)
+    functions_by_source["Server functions"] = parse_links("Server functions", URL_SERVER_FUNCS)
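+    # Assumed shape: parse_links returns {category: entries} for each source label,
+    # which parse_items_by_source then walks to fetch and convert every page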
 
     # TEST Parse only these:
     # functions_by_source["Shared functions"] = {
@@ -541,8 +605,8 @@ def main():
541605 # }
542606
543607 # Events
544- events_by_source ["Client events" ] = parse_links ("Client events" , URL_CLIENT_EVENTS )
545- events_by_source ["Server events" ] = parse_links ("Server events" , URL_SERVER_EVENTS )
608+ # events_by_source["Client events"] = parse_links("Client events", URL_CLIENT_EVENTS)
609+ # events_by_source["Server events"] = parse_links("Server events", URL_SERVER_EVENTS)
546610
547611 # Empty output directory
548612 if os .path .exists ("./output" ):