From dfda2763178936b5c3af4ddedba962d51fd0ff9b Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Wed, 18 Nov 2020 22:07:14 -0500 Subject: [PATCH 1/6] update formatting --- script/check-english-links.js | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/script/check-english-links.js b/script/check-english-links.js index 1fc5743c577f..77bab4fb8216 100755 --- a/script/check-english-links.js +++ b/script/check-english-links.js @@ -76,18 +76,14 @@ async function main () { const skippedLinks = result.links.filter(x => x.state === 'SKIPPED') const brokenLinks = result.links.filter(x => x.state === 'BROKEN') - console.log(dedent` - ${brokenLinks.length} broken links found on docs.github.com - - Link scan completed in ${endTime - startTime}ms - Total links: ${result.links.length} - Skipped links: ${skippedLinks.length} - Broken links: ${brokenLinks.length} - For more details see ${path.relative(process.cwd(), logFile)} - `) + console.log(`${brokenLinks.length} broken links found on docs.github.com\n`) if (brokenLinks.length) { - console.log('\n\n' + JSON.stringify(brokenLinks, null, 2)) + console.log('```') + brokenLinks.forEach(brokenLinkObj => { + console.log(JSON.stringify(brokenLinkObj, null, 2)) + }) + console.log('```') process.exit(1) } From 018b1316bf1ddcefb79ec8b7aeb4eda500c85e48 Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Thu, 19 Nov 2020 11:34:41 -0500 Subject: [PATCH 2/6] format excluded links for use in regex --- lib/excluded-links.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/excluded-links.js b/lib/excluded-links.js index 22f8540240ef..8339a4a01f69 100644 --- a/lib/excluded-links.js +++ b/lib/excluded-links.js @@ -1,8 +1,8 @@ // Linkinator treats the following as regex. module.exports = [ // Skip GitHub search links. - 'https://github.com/search?.*', - 'https://github.com/github/gitignore/search?', + 'https://github.com/search\\?', + 'https://github.com/github/gitignore/search\\?', // These links require auth. 'https://github.com/settings/profile', @@ -15,6 +15,6 @@ module.exports = [ // Oneoff links that link checkers think are broken but are not. 'https://haveibeenpwned.com/', - 'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029', + 'https://www.ilo.org/dyn/normlex/en/f\\?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029', 'http://www.w3.org/wiki/LinkHeader/' ] From 1b5153cda1dca65fdd3544f19318044b48501935 Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Thu, 19 Nov 2020 11:35:16 -0500 Subject: [PATCH 3/6] add retrying step and add better formatting --- script/check-english-links.js | 83 ++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/script/check-english-links.js b/script/check-english-links.js index 77bab4fb8216..6ba05b94d6b7 100755 --- a/script/check-english-links.js +++ b/script/check-english-links.js @@ -3,14 +3,21 @@ const path = require('path') const fs = require('fs') const linkinator = require('linkinator') -const dedent = require('dedent') const program = require('commander') -const { escapeRegExp } = require('lodash') +const { pull } = require('lodash') const checker = new linkinator.LinkChecker() const rimraf = require('rimraf').sync +const mkdirp = require('mkdirp').sync const root = 'https://docs.github.com' const englishRoot = `${root}/en` const { deprecated } = require('../lib/enterprise-server-releases') +const got = require('got') + +// Links with these codes may or may not really be broken +const retryStatusCodes = [429, 503, 'Undefined'] + +// Broken S3 image URLs result in 403s, broken docs URLs results in 404s +const allBrokenStatusCodes = [403, 404, ...retryStatusCodes] // [start-readme] // @@ -23,11 +30,11 @@ const { deprecated } = require('../lib/enterprise-server-releases') program .description('Check all links in the English docs.') .option('-d, --dry-run', 'Turn off recursion to get a fast minimal report (useful for previewing output).') + .option('-p, --path ', 'Provide an optional path to check. Best used with --dry-run. If not provided, defaults to the homepage.') .parse(process.argv) // Skip excluded links defined in separate file. const excludedLinks = require('../lib/excluded-links') - .map(link => escapeRegExp(link)) // Skip non-English content. const languagesToSkip = Object.keys(require('../lib/languages')) @@ -40,7 +47,7 @@ const languagesToSkip = Object.keys(require('../lib/languages')) const enterpriseReleasesToSkip = new RegExp(`${root}.+?[/@](${deprecated.join('|')})/`) const config = { - path: englishRoot, + path: program.path || englishRoot, concurrency: 300, // If this is a dry run, turn off recursion. recurse: !program.dryRun, @@ -56,12 +63,10 @@ const config = { main() async function main () { - const startTime = new Date() - // Clear and recreate a directory for logs. const logFile = path.join(__dirname, '../.linkinator/full.log') rimraf(path.dirname(logFile)) - fs.mkdirSync(path.dirname(logFile), { recursive: true }) + mkdirp(path.dirname(logFile)) // Update CLI output and append to logfile after each checked link. checker.on('link', result => { @@ -69,23 +74,59 @@ async function main () { }) // Start the scan; events will be logged as they occur. - const result = await checker.check(config) + const result = (await checker.check(config)).links + + // Scan is complete! Filter the results for broken links. + const brokenLinks = result + .filter(link => link.state === 'BROKEN') + // Coerce undefined status codes into strings so we can filter for them like the other status codes. + .map(link => { + if (!link.status) link.status = 'Undefined' + return link + }) - // Scan is complete! Display the results. - const endTime = new Date() - const skippedLinks = result.links.filter(x => x.state === 'SKIPPED') - const brokenLinks = result.links.filter(x => x.state === 'BROKEN') + // Links to retry individually. + const linksToRetry = brokenLinks + .filter(link => retryStatusCodes.find(retryStatusCode => link.status === retryStatusCode)) + + await Promise.all(linksToRetry + .map(async (link) => { + try { + const r = await got(link.url) + // Remove the link from the list if got can access it. + if (!allBrokenStatusCodes.find(brokenStatusCode => r.statusCode === brokenStatusCode)) { + pull(brokenLinks, link) + } + // Do nothing if the URL is invalid, since it's already captured in the broken list. + } catch (err) { + // noop + } + })) + + // Exit successfully if no broken links! + if (!brokenLinks.length) { + console.log('All links are good!') + process.exit(0) + } + // Format and display the results. console.log(`${brokenLinks.length} broken links found on docs.github.com\n`) + allBrokenStatusCodes + .forEach(statusCode => displayBrokenLinks(statusCode, brokenLinks)) - if (brokenLinks.length) { - console.log('```') - brokenLinks.forEach(brokenLinkObj => { - console.log(JSON.stringify(brokenLinkObj, null, 2)) - }) - console.log('```') - process.exit(1) - } + // Exit unsuccessfully if broken links are found. + process.exit(1) +} + +function displayBrokenLinks (statusCode, brokenLinks) { + const brokenLinksForStatus = brokenLinks.filter(x => x.status === statusCode) - process.exit(0) + if (!brokenLinksForStatus.length) return + + console.log(`## Status code ${statusCode}: Found ${brokenLinksForStatus.length} broken links`) + console.log('```') + brokenLinksForStatus.forEach(brokenLinkObj => { + console.log(JSON.stringify(brokenLinkObj, null, 2)) + }) + console.log('```') } From 53d1c1d65cb57904435c56f53a6479afeba0e4a6 Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Thu, 19 Nov 2020 12:03:17 -0500 Subject: [PATCH 4/6] a bit more cleanup --- script/check-english-links.js | 46 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/script/check-english-links.js b/script/check-english-links.js index 6ba05b94d6b7..8b119a354983 100755 --- a/script/check-english-links.js +++ b/script/check-english-links.js @@ -4,7 +4,7 @@ const path = require('path') const fs = require('fs') const linkinator = require('linkinator') const program = require('commander') -const { pull } = require('lodash') +const { pull, uniq } = require('lodash') const checker = new linkinator.LinkChecker() const rimraf = require('rimraf').sync const mkdirp = require('mkdirp').sync @@ -16,14 +16,11 @@ const got = require('got') // Links with these codes may or may not really be broken const retryStatusCodes = [429, 503, 'Undefined'] -// Broken S3 image URLs result in 403s, broken docs URLs results in 404s -const allBrokenStatusCodes = [403, 404, ...retryStatusCodes] - // [start-readme] // // This script runs once per day via a scheduled GitHub Action to check all links in // English content, not including deprecated Enterprise Server content. It opens an issue -// if it finds broken links. To exclude a link, add it to `lib/excluded-links.js`. +// if it finds broken links. To exclude a link path, add it to `lib/excluded-links.js`. // // [end-readme] @@ -65,8 +62,8 @@ main() async function main () { // Clear and recreate a directory for logs. const logFile = path.join(__dirname, '../.linkinator/full.log') - rimraf(path.dirname(logFile)) - mkdirp(path.dirname(logFile)) + rimraf(logFile) + mkdirp(logFile) // Update CLI output and append to logfile after each checked link. checker.on('link', result => { @@ -79,7 +76,7 @@ async function main () { // Scan is complete! Filter the results for broken links. const brokenLinks = result .filter(link => link.state === 'BROKEN') - // Coerce undefined status codes into strings so we can filter for them like the other status codes. + // Coerce undefined status codes into strings so we can filter and display them (otherwise they stringify as 0) .map(link => { if (!link.status) link.status = 'Undefined' return link @@ -92,12 +89,11 @@ async function main () { await Promise.all(linksToRetry .map(async (link) => { try { - const r = await got(link.url) - // Remove the link from the list if got can access it. - if (!allBrokenStatusCodes.find(brokenStatusCode => r.statusCode === brokenStatusCode)) { - pull(brokenLinks, link) - } - // Do nothing if the URL is invalid, since it's already captured in the broken list. + // got throws an HTTPError if response code is not 2xx or 3xx. + // If got succeeds, we can remove the link from the list. + await got(link.url) + pull(brokenLinks, link) + // If got fails, do nothing. The link is already in the broken list. } catch (err) { // noop } @@ -111,22 +107,24 @@ async function main () { // Format and display the results. console.log(`${brokenLinks.length} broken links found on docs.github.com\n`) - allBrokenStatusCodes - .forEach(statusCode => displayBrokenLinks(statusCode, brokenLinks)) + displayBrokenLinks(brokenLinks) // Exit unsuccessfully if broken links are found. process.exit(1) } -function displayBrokenLinks (statusCode, brokenLinks) { - const brokenLinksForStatus = brokenLinks.filter(x => x.status === statusCode) +function displayBrokenLinks (brokenLinks) { + // Sort results by status code. + const allStatusCodes = uniq(brokenLinks.map(x => x.status)) - if (!brokenLinksForStatus.length) return + allStatusCodes.forEach(statusCode => { + const brokenLinksForStatus = brokenLinks.filter(x => x.status === statusCode) - console.log(`## Status code ${statusCode}: Found ${brokenLinksForStatus.length} broken links`) - console.log('```') - brokenLinksForStatus.forEach(brokenLinkObj => { - console.log(JSON.stringify(brokenLinkObj, null, 2)) + console.log(`## Status code ${statusCode}: Found ${brokenLinksForStatus.length} broken links`) + console.log('```') + brokenLinksForStatus.forEach(brokenLinkObj => { + console.log(JSON.stringify(brokenLinkObj, null, 2)) + }) + console.log('```') }) - console.log('```') } From 0a2fb5c6ac5d9cf6c6dc0cf302229e9ff88cabf0 Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Thu, 19 Nov 2020 13:34:20 -0500 Subject: [PATCH 5/6] Update script/check-english-links.js Co-authored-by: Jason Etcovitch --- script/check-english-links.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/check-english-links.js b/script/check-english-links.js index 8b119a354983..12018954da2d 100755 --- a/script/check-english-links.js +++ b/script/check-english-links.js @@ -84,7 +84,7 @@ async function main () { // Links to retry individually. const linksToRetry = brokenLinks - .filter(link => retryStatusCodes.find(retryStatusCode => link.status === retryStatusCode)) + .filter(link => retryStatusCodes.includes(link.status)) await Promise.all(linksToRetry .map(async (link) => { From 2187e9892995136e861e25937a5bcf657e49c8aa Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Thu, 19 Nov 2020 14:16:34 -0500 Subject: [PATCH 6/6] move some things around --- script/check-english-links.js | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/script/check-english-links.js b/script/check-english-links.js index 12018954da2d..e8be9c22eefb 100755 --- a/script/check-english-links.js +++ b/script/check-english-links.js @@ -13,8 +13,8 @@ const englishRoot = `${root}/en` const { deprecated } = require('../lib/enterprise-server-releases') const got = require('got') -// Links with these codes may or may not really be broken -const retryStatusCodes = [429, 503, 'Undefined'] +// Links with these codes may or may not really be broken. +const retryStatusCodes = [429, 503] // [start-readme] // @@ -62,8 +62,8 @@ main() async function main () { // Clear and recreate a directory for logs. const logFile = path.join(__dirname, '../.linkinator/full.log') - rimraf(logFile) - mkdirp(logFile) + rimraf(path.dirname(logFile)) + mkdirp(path.dirname(logFile)) // Update CLI output and append to logfile after each checked link. checker.on('link', result => { @@ -76,15 +76,10 @@ async function main () { // Scan is complete! Filter the results for broken links. const brokenLinks = result .filter(link => link.state === 'BROKEN') - // Coerce undefined status codes into strings so we can filter and display them (otherwise they stringify as 0) - .map(link => { - if (!link.status) link.status = 'Undefined' - return link - }) // Links to retry individually. const linksToRetry = brokenLinks - .filter(link => retryStatusCodes.includes(link.status)) + .filter(link => !link.status || retryStatusCodes.includes(link.status)) await Promise.all(linksToRetry .map(async (link) => { @@ -115,12 +110,20 @@ async function main () { function displayBrokenLinks (brokenLinks) { // Sort results by status code. - const allStatusCodes = uniq(brokenLinks.map(x => x.status)) + const allStatusCodes = uniq(brokenLinks + // Coerce undefined status codes into `Invalid` strings so we can display them. + // Without this, undefined codes get JSON.stringified as `0`, which is not useful output. + .map(link => { + if (!link.status) link.status = 'Invalid' + return link + }) + .map(link => link.status) + ) allStatusCodes.forEach(statusCode => { const brokenLinksForStatus = brokenLinks.filter(x => x.status === statusCode) - console.log(`## Status code ${statusCode}: Found ${brokenLinksForStatus.length} broken links`) + console.log(`## Status ${statusCode}: Found ${brokenLinksForStatus.length} broken links`) console.log('```') brokenLinksForStatus.forEach(brokenLinkObj => { console.log(JSON.stringify(brokenLinkObj, null, 2))