diff --git a/lib/excluded-links.js b/lib/excluded-links.js
index 22f8540240ef..8339a4a01f69 100644
--- a/lib/excluded-links.js
+++ b/lib/excluded-links.js
@@ -1,8 +1,8 @@
 // Linkinator treats the following as regex.
 module.exports = [
   // Skip GitHub search links.
-  'https://github.com/search?.*',
-  'https://github.com/github/gitignore/search?',
+  'https://github.com/search\\?',
+  'https://github.com/github/gitignore/search\\?',
   // These links require auth.
   'https://github.com/settings/profile',
@@ -15,6 +15,6 @@ module.exports = [
   // Oneoff links that link checkers think are broken but are not.
   'https://haveibeenpwned.com/',
-  'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
+  'https://www.ilo.org/dyn/normlex/en/f\\?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
   'http://www.w3.org/wiki/LinkHeader/'
 ]
diff --git a/script/check-english-links.js b/script/check-english-links.js
index 1fc5743c577f..e8be9c22eefb 100755
--- a/script/check-english-links.js
+++ b/script/check-english-links.js
@@ -3,31 +3,35 @@
 const path = require('path')
 const fs = require('fs')
 const linkinator = require('linkinator')
-const dedent = require('dedent')
 const program = require('commander')
-const { escapeRegExp } = require('lodash')
+const { pull, uniq } = require('lodash')
 const checker = new linkinator.LinkChecker()
 const rimraf = require('rimraf').sync
+const mkdirp = require('mkdirp').sync
 const root = 'https://docs.github.com'
 const englishRoot = `${root}/en`
 const { deprecated } = require('../lib/enterprise-server-releases')
+const got = require('got')
+
+// Links with these codes may or may not really be broken.
+const retryStatusCodes = [429, 503]
 
 // [start-readme]
 //
 // This script runs once per day via a scheduled GitHub Action to check all links in
 // English content, not including deprecated Enterprise Server content. It opens an issue
-// if it finds broken links. To exclude a link, add it to `lib/excluded-links.js`.
+// if it finds broken links. To exclude a link path, add it to `lib/excluded-links.js`.
 //
 // [end-readme]
 
 program
   .description('Check all links in the English docs.')
   .option('-d, --dry-run', 'Turn off recursion to get a fast minimal report (useful for previewing output).')
+  .option('-p, --path <path>', 'Provide an optional path to check. Best used with --dry-run. If not provided, defaults to the homepage.')
   .parse(process.argv)
 
 // Skip excluded links defined in separate file.
 const excludedLinks = require('../lib/excluded-links')
-  .map(link => escapeRegExp(link))
 
 // Skip non-English content.
 const languagesToSkip = Object.keys(require('../lib/languages'))
@@ -40,7 +44,7 @@ const languagesToSkip = Object.keys(require('../lib/languages'))
 const enterpriseReleasesToSkip = new RegExp(`${root}.+?[/@](${deprecated.join('|')})/`)
 
 const config = {
-  path: englishRoot,
+  path: program.path || englishRoot,
   concurrency: 300,
   // If this is a dry run, turn off recursion.
   recurse: !program.dryRun,
@@ -56,12 +60,10 @@ const config = {
 main()
 
 async function main () {
-  const startTime = new Date()
-
   // Clear and recreate a directory for logs.
   const logFile = path.join(__dirname, '../.linkinator/full.log')
   rimraf(path.dirname(logFile))
-  fs.mkdirSync(path.dirname(logFile), { recursive: true })
+  mkdirp(path.dirname(logFile))
 
   // Update CLI output and append to logfile after each checked link.
   checker.on('link', result => {
@@ -69,27 +71,63 @@ async function main () {
   })
 
   // Start the scan; events will be logged as they occur.
-  const result = await checker.check(config)
-
-  // Scan is complete! Display the results.
-  const endTime = new Date()
-  const skippedLinks = result.links.filter(x => x.state === 'SKIPPED')
-  const brokenLinks = result.links.filter(x => x.state === 'BROKEN')
-
-  console.log(dedent`
-    ${brokenLinks.length} broken links found on docs.github.com
-
-    Link scan completed in ${endTime - startTime}ms
-    Total links: ${result.links.length}
-    Skipped links: ${skippedLinks.length}
-    Broken links: ${brokenLinks.length}
-    For more details see ${path.relative(process.cwd(), logFile)}
-  `)
-
-  if (brokenLinks.length) {
-    console.log('\n\n' + JSON.stringify(brokenLinks, null, 2))
-    process.exit(1)
+  const result = (await checker.check(config)).links
+
+  // Scan is complete! Filter the results for broken links.
+  const brokenLinks = result
+    .filter(link => link.state === 'BROKEN')
+
+  // Links to retry individually.
+  const linksToRetry = brokenLinks
+    .filter(link => !link.status || retryStatusCodes.includes(link.status))
+
+  await Promise.all(linksToRetry
+    .map(async (link) => {
+      try {
+        // got throws an HTTPError if response code is not 2xx or 3xx.
+        // If got succeeds, we can remove the link from the list.
+        await got(link.url)
+        pull(brokenLinks, link)
+        // If got fails, do nothing. The link is already in the broken list.
+      } catch (err) {
+        // noop
+      }
+    }))
+
+  // Exit successfully if no broken links!
+  if (!brokenLinks.length) {
+    console.log('All links are good!')
+    process.exit(0)
   }
 
-  process.exit(0)
+  // Format and display the results.
+  console.log(`${brokenLinks.length} broken links found on docs.github.com\n`)
+  displayBrokenLinks(brokenLinks)
+
+  // Exit unsuccessfully if broken links are found.
+  process.exit(1)
+}
+
+function displayBrokenLinks (brokenLinks) {
+  // Sort results by status code.
+  const allStatusCodes = uniq(brokenLinks
+    // Coerce undefined status codes into `Invalid` strings so we can display them.
+    // Without this, undefined codes get JSON.stringified as `0`, which is not useful output.
+    .map(link => {
+      if (!link.status) link.status = 'Invalid'
+      return link
+    })
+    .map(link => link.status)
+  )
+
+  allStatusCodes.forEach(statusCode => {
+    const brokenLinksForStatus = brokenLinks.filter(x => x.status === statusCode)
+
+    console.log(`## Status ${statusCode}: Found ${brokenLinksForStatus.length} broken links`)
+    console.log('```')
+    brokenLinksForStatus.forEach(brokenLinkObj => {
+      console.log(JSON.stringify(brokenLinkObj, null, 2))
+    })
+    console.log('```')
+  })
+}
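For context on the new retry pass, here is a minimal standalone sketch of the same pattern, not part of this diff: the function name `retryBrokenLinks` is hypothetical, and the sketch assumes `got` and `lodash` are installed, as they are in this repo after the patch. It re-requests links whose failures may be transient (no status code, 429, or 503) and drops any that now respond successfully, mirroring the logic added to `main()`.

```js
// Illustrative sketch only — not code from this pull request.
const got = require('got')
const { pull } = require('lodash')

// Same transient codes the script treats as "maybe not really broken".
const retryStatusCodes = [429, 503]

// Hypothetical helper: re-check flaky links and return the trimmed list.
async function retryBrokenLinks (brokenLinks) {
  // Only retry links with no status code or a transient status code.
  const linksToRetry = brokenLinks
    .filter(link => !link.status || retryStatusCodes.includes(link.status))

  await Promise.all(linksToRetry.map(async link => {
    try {
      // got throws for non-2xx/3xx responses; a successful request means
      // the link works after all, so remove it from the broken list.
      await got(link.url)
      pull(brokenLinks, link)
    } catch (err) {
      // Still failing; keep it in the list.
    }
  }))

  return brokenLinks
}

// Example: a 503 that recovers on retry would be removed from this array.
// retryBrokenLinks([{ url: 'https://example.com', status: 503 }]).then(console.log)
```

Limiting retries to status-less, 429, and 503 results keeps genuinely dead links in the nightly report while filtering out rate-limiting and temporary-outage noise.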