From e52d1552197b958bb3648c656286fced0be5356a Mon Sep 17 00:00:00 2001
From: saswattulo
Date: Mon, 14 Jul 2025 12:20:34 +0530
Subject: [PATCH] add robust validation

---
 check_readme_links.py | 178 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 153 insertions(+), 25 deletions(-)

diff --git a/check_readme_links.py b/check_readme_links.py
index 8147b35..b9e70fd 100644
--- a/check_readme_links.py
+++ b/check_readme_links.py
@@ -1,33 +1,161 @@
-# README Link Checker
-#
-# This script checks the online status of links in a README.md file. It extracts all URLs from the README file
-# and sends a HEAD request to each URL to determine if the link is online or not.
-#
-# python check_readme_links.py path/to/README.md
-#
-# Author: Brandon Himpfen
-# Website: himpfen.xyz
+#!/usr/bin/env python3
+"""
+README Link Checker
+
+This script checks the online status of links in a README.md file. It extracts all URLs from the README file
+and sends HTTP requests to each URL to determine if the link is online or not.
+
+Usage:
+    python check_readme_links.py [path/to/README.md]
+
+If no file path is provided, it will look for README.md in the current directory.
+
+Author: Saswat Tulo
+"""
 
 import requests
 import re
+import sys
+import os
+from urllib.parse import urlparse
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import time
 
-def check_links(file_path):
-    with open(file_path, 'r') as readme_file:
-        contents = readme_file.read()
+def extract_urls(content):
+    """Extract URLs from markdown content using multiple patterns."""
+    patterns = [
+        r'\[.*?\]\((https?://.*?)\)',  # Markdown links
+        r'<(https?://.*?)>',           # Direct links in angle brackets
+        r'(?:^|\s)(https?://\S+)',     # Bare URLs
+    ]
+
+    urls = set()
+    for pattern in patterns:
+        matches = re.findall(pattern, content, re.MULTILINE)
+        urls.update(matches)
+
+    return list(urls)
 
-    # Extract all URLs from the README file
-    urls = re.findall(r'\[.*\]\((http[s]?://.*?)\)', contents)
+def check_single_link(url, timeout=10):
+    """Check if a single URL is accessible."""
+    try:
+        # Try HEAD request first (faster)
+        response = requests.head(url, timeout=timeout, allow_redirects=True)
+
+        # If HEAD fails, try GET (some servers don't support HEAD)
+        if response.status_code >= 400:
+            response = requests.get(url, timeout=timeout, allow_redirects=True)
+
+        return {
+            'url': url,
+            'status_code': response.status_code,
+            'status': 'online' if response.status_code < 400 else 'error',
+            'error': None,
+            'final_url': response.url if response.url != url else None
+        }
+    except requests.exceptions.Timeout:
+        return {
+            'url': url,
+            'status_code': None,
+            'status': 'timeout',
+            'error': 'Request timed out',
+            'final_url': None
+        }
+    except requests.exceptions.ConnectionError:
+        return {
+            'url': url,
+            'status_code': None,
+            'status': 'connection_error',
+            'error': 'Connection failed',
+            'final_url': None
+        }
+    except requests.exceptions.RequestException as e:
+        return {
+            'url': url,
+            'status_code': None,
+            'status': 'error',
+            'error': str(e),
+            'final_url': None
+        }
 
-    for url in urls:
-        try:
-            response = requests.head(url)
-            if response.status_code == 200:
-                print(f"Link {url} is online.")
+def check_links(file_path, max_workers=5, timeout=10, verbose=False):
+    """Check all links in a README file."""
+    if not os.path.exists(file_path):
+        print(f"Error: File '{file_path}' not found.")
+        return False
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as readme_file:
+            contents = readme_file.read()
+    except UnicodeDecodeError:
+ print(f"Error: Could not read file '{file_path}' as UTF-8.") + return False + + urls = extract_urls(contents) + + if not urls: + print("No URLs found in the README file.") + return True + + print(f"Found {len(urls)} URLs to check...\n") + + results = [] + online_count = 0 + error_count = 0 + + # Check links concurrently + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_url = {executor.submit(check_single_link, url, timeout): url for url in urls} + + for future in as_completed(future_to_url): + result = future.result() + results.append(result) + + # Print results as they come in + if result['status'] == 'online': + online_count += 1 + if verbose: + print(f"✓ {result['url']} (Status: {result['status_code']})") + if result['final_url']: + print(f" → Redirected to: {result['final_url']}") else: - print(f"Link {url} returned status code {response.status_code}.") - except requests.exceptions.RequestException as e: - print(f"Error occurred while checking link {url}: {str(e)}") + error_count += 1 + print(f"✗ {result['url']} - {result['status'].title()}") + if result['status_code']: + print(f" Status code: {result['status_code']}") + if result['error']: + print(f" Error: {result['error']}") + + # Summary + print(f"\n{'='*50}") + print(f"Summary:") + print(f"Total URLs checked: {len(urls)}") + print(f"Online: {online_count}") + print(f"Errors: {error_count}") + print(f"{'='*50}") + + return error_count == 0 + +def main(): + parser = argparse.ArgumentParser(description='Check links in README.md files') + parser.add_argument('file', nargs='?', default='README.md', + help='Path to README file (default: README.md)') + parser.add_argument('-t', '--timeout', type=int, default=10, + help='Request timeout in seconds (default: 10)') + parser.add_argument('-w', '--workers', type=int, default=5, + help='Number of concurrent workers (default: 5)') + parser.add_argument('-v', '--verbose', action='store_true', + help='Show successful links as well') + + args = parser.parse_args() + + print(f"Checking links in: {args.file}") + print(f"Timeout: {args.timeout}s | Workers: {args.workers}") + print("-" * 50) + + success = check_links(args.file, args.workers, args.timeout, args.verbose) + sys.exit(0 if success else 1) -# Provide the path to your README.md file -readme_path = 'path/to/README.md' -check_links(readme_path) +if __name__ == "__main__": + main() \ No newline at end of file