Merged
lib/excluded-links.js (6 changes: 3 additions & 3 deletions)
@@ -1,8 +1,8 @@
 // Linkinator treats the following as regex.
 module.exports = [
   // Skip GitHub search links.
-  'https://github.com/search?.*',
-  'https://github.com/github/gitignore/search?',
+  'https://github.com/search\\?',
+  'https://github.com/github/gitignore/search\\?',
 
   // These links require auth.
   'https://github.com/settings/profile',
@@ -15,6 +15,6 @@ module.exports = [

   // Oneoff links that link checkers think are broken but are not.
   'https://haveibeenpwned.com/',
-  'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
+  'https://www.ilo.org/dyn/normlex/en/f\\?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
   'http://www.w3.org/wiki/LinkHeader/'
 ]
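
Because linkinator treats each entry above as a regular expression, a literal `?` has to be written as `\\?`; otherwise the `?` acts as the regex "optional" quantifier and the pattern matches URLs it should not. A minimal sketch of the difference (the test URLs are made up for illustration):

```js
// Minimal sketch: how an unescaped `?` changes what an exclusion pattern matches.
const unescaped = new RegExp('https://github.com/search?')   // `?` makes the preceding `h` optional
const escaped = new RegExp('https://github.com/search\\?')  // `\\?` matches a literal `?`

console.log(unescaped.test('https://github.com/searc'))       // true: matches even without the `h`
console.log(escaped.test('https://github.com/searc'))         // false
console.log(escaped.test('https://github.com/search?q=foo'))  // true: only real search URLs match
```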
script/check-english-links.js (96 changes: 67 additions & 29 deletions)
@@ -3,31 +3,35 @@
 const path = require('path')
 const fs = require('fs')
 const linkinator = require('linkinator')
-const dedent = require('dedent')
 const program = require('commander')
-const { escapeRegExp } = require('lodash')
+const { pull, uniq } = require('lodash')
 const checker = new linkinator.LinkChecker()
 const rimraf = require('rimraf').sync
+const mkdirp = require('mkdirp').sync
 const root = 'https://docs.github.com'
 const englishRoot = `${root}/en`
 const { deprecated } = require('../lib/enterprise-server-releases')
+const got = require('got')
 
+// Links with these codes may or may not really be broken.
+const retryStatusCodes = [429, 503]
+
 // [start-readme]
 //
 // This script runs once per day via a scheduled GitHub Action to check all links in
 // English content, not including deprecated Enterprise Server content. It opens an issue
-// if it finds broken links. To exclude a link, add it to `lib/excluded-links.js`.
+// if it finds broken links. To exclude a link path, add it to `lib/excluded-links.js`.
 //
 // [end-readme]
 
 program
   .description('Check all links in the English docs.')
   .option('-d, --dry-run', 'Turn off recursion to get a fast minimal report (useful for previewing output).')
+  .option('-p, --path <PATH>', 'Provide an optional path to check. Best used with --dry-run. If not provided, defaults to the homepage.')
   .parse(process.argv)
 
 // Skip excluded links defined in separate file.
 const excludedLinks = require('../lib/excluded-links')
-  .map(link => escapeRegExp(link))
 
 // Skip non-English content.
 const languagesToSkip = Object.keys(require('../lib/languages'))
@@ -40,7 +44,7 @@ const languagesToSkip = Object.keys(require('../lib/languages'))
 const enterpriseReleasesToSkip = new RegExp(`${root}.+?[/@](${deprecated.join('|')})/`)
 
 const config = {
-  path: englishRoot,
+  path: program.path || englishRoot,
   concurrency: 300,
   // If this is a dry run, turn off recursion.
   recurse: !program.dryRun,
@@ -56,40 +60,74 @@ const config = {
 main()
 
 async function main () {
-  const startTime = new Date()
-
   // Clear and recreate a directory for logs.
   const logFile = path.join(__dirname, '../.linkinator/full.log')
   rimraf(path.dirname(logFile))
-  fs.mkdirSync(path.dirname(logFile), { recursive: true })
+  mkdirp(path.dirname(logFile))
 
   // Update CLI output and append to logfile after each checked link.
   checker.on('link', result => {
     fs.appendFileSync(logFile, JSON.stringify(result) + '\n')
   })
 
   // Start the scan; events will be logged as they occur.
-  const result = await checker.check(config)
-
-  // Scan is complete! Display the results.
-  const endTime = new Date()
-  const skippedLinks = result.links.filter(x => x.state === 'SKIPPED')
-  const brokenLinks = result.links.filter(x => x.state === 'BROKEN')
-
-  console.log(dedent`
-    ${brokenLinks.length} broken links found on docs.github.com
-
-    Link scan completed in ${endTime - startTime}ms
-    Total links: ${result.links.length}
-    Skipped links: ${skippedLinks.length}
-    Broken links: ${brokenLinks.length}
-    For more details see ${path.relative(process.cwd(), logFile)}
-  `)
-
-  if (brokenLinks.length) {
-    console.log('\n\n' + JSON.stringify(brokenLinks, null, 2))
-    process.exit(1)
+  const result = (await checker.check(config)).links
+
+  // Scan is complete! Filter the results for broken links.
+  const brokenLinks = result
+    .filter(link => link.state === 'BROKEN')
+
+  // Links to retry individually.
+  const linksToRetry = brokenLinks
+    .filter(link => !link.status || retryStatusCodes.includes(link.status))
+
+  await Promise.all(linksToRetry
+    .map(async (link) => {
+      try {
+        // got throws an HTTPError if response code is not 2xx or 3xx.
+        // If got succeeds, we can remove the link from the list.
+        await got(link.url)
+        pull(brokenLinks, link)
+        // If got fails, do nothing. The link is already in the broken list.
+      } catch (err) {
+        // noop
+      }
+    }))
+
+  // Exit successfully if no broken links!
+  if (!brokenLinks.length) {
+    console.log('All links are good!')
+    process.exit(0)
   }
 
-  process.exit(0)
+  // Format and display the results.
+  console.log(`${brokenLinks.length} broken links found on docs.github.com\n`)
+  displayBrokenLinks(brokenLinks)
+
+  // Exit unsuccessfully if broken links are found.
+  process.exit(1)
 }
+
+function displayBrokenLinks (brokenLinks) {
+  // Sort results by status code.
+  const allStatusCodes = uniq(brokenLinks
+    // Coerce undefined status codes into `Invalid` strings so we can display them.
+    // Without this, undefined codes get JSON.stringified as `0`, which is not useful output.
+    .map(link => {
+      if (!link.status) link.status = 'Invalid'
+      return link
+    })
+    .map(link => link.status)
+  )
+
+  allStatusCodes.forEach(statusCode => {
+    const brokenLinksForStatus = brokenLinks.filter(x => x.status === statusCode)
+
+    console.log(`## Status ${statusCode}: Found ${brokenLinksForStatus.length} broken links`)
+    console.log('```')
+    brokenLinksForStatus.forEach(brokenLinkObj => {
+      console.log(JSON.stringify(brokenLinkObj, null, 2))
+    })
+    console.log('```')
+  })
+}
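
The retry step in main() leans on got's default behavior, as the inline comment notes: the request promise resolves for 2xx/3xx responses and rejects with an HTTPError otherwise, so any link that responds successfully on a second attempt can be pulled out of the broken list. A standalone sketch of that pattern, using made-up link objects in place of linkinator results:

```js
// Standalone sketch of the retry-and-filter step (link objects are fabricated for illustration).
const got = require('got')
const { pull } = require('lodash')

const retryStatusCodes = [429, 503]

async function retryFlakyLinks (brokenLinks) {
  // Only retry links that have no status code or a "maybe broken" status code.
  const linksToRetry = brokenLinks
    .filter(link => !link.status || retryStatusCodes.includes(link.status))

  await Promise.all(linksToRetry.map(async link => {
    try {
      // got resolves on 2xx/3xx and throws an HTTPError otherwise,
      // so a resolved request means the link is not actually broken.
      await got(link.url)
      pull(brokenLinks, link)
    } catch (err) {
      // Leave the link in the broken list.
    }
  }))

  return brokenLinks
}

// Example usage with fabricated results:
// retryFlakyLinks([
//   { url: 'https://example.com/ok-after-retry', status: 503 },
//   { url: 'https://example.com/really-broken', status: 404 }
// ]).then(remaining => console.log(remaining))
```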