From d98bc3b2249fc942418ef1e08d663d26b07c36f1 Mon Sep 17 00:00:00 2001
From: Damian
Date: Wed, 15 Oct 2025 14:39:54 +0200
Subject: [PATCH 01/22] polish cdx_toolkit example

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 8d6d7a4..3e58cb7 100644
--- a/Makefile
+++ b/Makefile
@@ -36,12 +36,12 @@ extract:
 	@echo "hint: python -m json.tool extraction.json"
 
 cdx_toolkit:
-	@echo look up this capture in the comoncrawl cdx index
-	#cdxt --cc --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
-	cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
+	@echo look up this capture in the commoncrawl cdx index for CC-MAIN-2024-22, returning only the first match
+	cdxt --limit 1 --crawl CC-MAIN-2024-22 iter an.wikipedia.org/wiki/Escopete
 	@echo
-	@echo extract the content from the commoncrawl s3 bucket
+	@echo clean up previous work
 	rm -f TEST-000000.extracted.warc.gz
+	@echo extract the content from the commoncrawl s3 bucket, using the timestamp from above
 	cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
 	@echo
 	@echo index this new warc

From 5621551035c38d16869a7eade1fba02f3acbdd95 Mon Sep 17 00:00:00 2001
From: Damian
Date: Wed, 15 Oct 2025 14:51:32 +0200
Subject: [PATCH 02/22] wip edits

---
 README.md | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index d6d1d70..d3dd959 100644
--- a/README.md
+++ b/README.md
@@ -87,15 +87,15 @@ You'll see four records total, with the start of each record marked with the hea
 ### WET
 
-WET (WARC Encapsulated Text) files only contain the body text of web pages extracted from the HTML and exclude any HTML code, images, or other media. This makes them useful for text analysis and natural language processing (NLP) tasks.
+WET (WARC Encapsulated Text) files contain only the body text of web pages parsed from the HTML and exclude any HTML code, images, or other media. This makes them useful for text analysis and natural language processing (NLP) tasks.
 
 Open `whirlwind.warc.wet`: this is the WET derived from our original WARC. We can see that it's still in WARC format with two records:
 1) a `warcinfo` record.
-2) a `conversion` record: the extracted text with the HTTP headers removed.
+2) a `conversion` record: the parsed text with HTTP headers removed.
 
 ### WAT
 
-WAT (Web ARChive Timestamp) files contain metadata associated with the crawled web pages (e.g. parsed data from the HTTP response headers, links extracted from HTML pages, server response codes etc.). They are useful for analysis that requires understanding the structure of the web.
+WAT (Web ARChive Timestamp) files contain metadata associated with the crawled web pages (e.g. parsed data from the HTTP response headers, links recovered from HTML pages, server response codes, etc.). They are useful for analysis that requires understanding the structure of the web.
 
 Open `whirlwind.warc.wat`: this is the WAT derived from our original WARC. Like the WET file, it's also in WARC format. It contains two records:
 1) a `warcinfo` record.
@@ -217,9 +217,9 @@ For each of these records, there's one text line in the index - yes, it's a flat
 
 What is the purpose of this funky format? It's done this way because these flat files (300 gigabytes total per crawl) can be sorted on the primary key using any out-of-core sort utility, e.g. the standard Linux `sort`, or one of the Hadoop-based out-of-core sort functions.
 
-The JSON blob has enough information to extract individual records: it says which warc file the record is in, and the offset and length of the record. We'll use that in the next section.
+The JSON blob has enough information to cleanly isolate the raw data of a single record: it records which WARC file the record is in, and the byte offset and length of the record within that file. We'll use that in the next section.
 
-## Task 4: Use the CDXJ index to extract raw content from the local WARC, WET, and WAT
+## Task 4: Use the CDXJ index to extract a subset of raw content from the local WARC, WET, and WAT
 
 Normally, compressed files aren't random access. However, the WARC files use a trick to make this possible, which is that every record needs to be separately compressed. The `gzip` compression utility supports this, but it's rarely used.
 
@@ -350,21 +350,23 @@ The output looks like this:
 Click to view output
 ```
-look up this capture in the comoncrawl cdx index for CC-MAIN-2024-22, returning only the first match:
-cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
+look up this capture in the commoncrawl cdx index for CC-MAIN-2024-22, returning only the first match
+$ cdxt --limit 1 --crawl CC-MAIN-2024-22 iter an.wikipedia.org/wiki/Escopete
 status 200, timestamp 20240518015810, url https://an.wikipedia.org/wiki/Escopete
 
-extract the content from the commoncrawl s3 bucket
-rm -f TEST-000000.extracted.warc.gz
-cdxt --cc --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
+clean up previous work, if any
+$ rm -f TEST-000000.extracted.warc.gz
+retrieve the content from the commoncrawl s3 bucket, restricting to the timestamp we were given above
+$ cdxt --cc --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
+data is written to TEST-<n>.extracted.warc.gz, where <n> starts at 000000 and counts upward if a file already exists at 000000
 
 index this new warc
-cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
-cat TEST-000000.extracted.warc.cdxj
+$ cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
+$ cat TEST-000000.extracted.warc.cdxj
 org,wikipedia,an)/wiki/escopete 20240518015810 {"url": "https://an.wikipedia.org/wiki/Escopete", "mime": "text/html", "status": "200", "digest": "sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU", "length": "17455", "offset": "379", "filename": "TEST-000000.extracted.warc.gz"}
 
 iterate this new warc
-python ./warcio-iterator.py TEST-000000.extracted.warc.gz
+$ python ./warcio-iterator.py TEST-000000.extracted.warc.gz
 WARC-Type: warcinfo
 WARC-Type: response
 WARC-Target-URI https://an.wikipedia.org/wiki/Escopete
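The sample output in this patch shows everything Task 4 needs from a CDXJ line: the `filename`, `offset`, and `length` fields are enough to pull a single record out of a WARC without reading the rest of the file. Below is a minimal sketch of that lookup in Python (not part of the patches themselves); it assumes the `TEST-000000.extracted.warc.cdxj` and `TEST-000000.extracted.warc.gz` files produced by the `cdx_toolkit` target are in the current directory, and that `warcio` is installed:

```python
import io
import json

from warcio.archiveiterator import ArchiveIterator


def read_record_at(cdxj_line):
    # A CDXJ line is "<SURT key> <timestamp> <JSON blob>"; only the JSON
    # blob is needed to locate the raw record bytes.
    _surt, _timestamp, blob = cdxj_line.split(" ", 2)
    fields = json.loads(blob)
    offset = int(fields["offset"])
    length = int(fields["length"])
    # Every record is gzipped separately, so this byte slice is a valid
    # stand-alone .warc.gz stream that warcio can parse on its own.
    with open(fields["filename"], "rb") as f:
        f.seek(offset)
        data = f.read(length)
    for record in ArchiveIterator(io.BytesIO(data)):
        return record


with open("TEST-000000.extracted.warc.cdxj") as f:
    record = read_record_at(f.readline())

print(record.rec_headers.get_header("WARC-Target-URI"))
```

The same `offset`/`length` pair also works against the Common Crawl bucket directly, as an HTTP `Range: bytes=<offset>-<offset+length-1>` request for `https://data.commoncrawl.org/<filename>`, which is roughly what `cdxt ... warc` is doing on our behalf.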
From 1354bddc534d601e1231103fd7a2c46a84bd51b Mon Sep 17 00:00:00 2001
From: Damian Stewart
Date: Wed, 15 Oct 2025 14:51:37 +0200
Subject: [PATCH 03/22] wip edits 2

---
 CC-MAIN-2024-22.warc.paths.gz      | Bin 817 -> 844 bytes
 README.md                          |  11 +-
 notebooks/warcio_experiments.ipynb | 923 +++++++++++++++++++++++++++++
 3 files changed, 929 insertions(+), 5 deletions(-)
 create mode 100644 notebooks/warcio_experiments.ipynb

diff --git a/CC-MAIN-2024-22.warc.paths.gz b/CC-MAIN-2024-22.warc.paths.gz
index 0ff536d75299e54bb5edb342fa040d3a0743fadb..4099c937498315ba83082f995ada7387e218c4db 100644
GIT binary patch
delta 46
zcmdnUc7{z=zMF&NMgH>)24-hxU0+8}KV2gOBNJUCBfav(qGY{-#FC6+hK*e6%m7J#
B4SoOs

delta 19
YcmX@ZwvmlXzMF#q1elmNs;V;s04W*+O8@`>

diff --git a/README.md b/README.md
index d6d1d70..d64c668 100644
--- a/README.md
+++ b/README.md
@@ -350,18 +350,19 @@ The output looks like this:
 Click to view output
 ```
-look up this capture in the comoncrawl cdx index for CC-MAIN-2024-22, returning only the first match:
-cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
+look up this capture in the commoncrawl cdx index for CC-MAIN-2024-22, returning only the first match
+cdxt --limit 1 --crawl CC-MAIN-2024-22 iter an.wikipedia.org/wiki/Escopete
 status 200, timestamp 20240518015810, url https://an.wikipedia.org/wiki/Escopete
 
-extract the content from the commoncrawl s3 bucket
+clean up previous work
 rm -f TEST-000000.extracted.warc.gz
-cdxt --cc --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
+extract the content from the commoncrawl s3 bucket, using the timestamp from above
+cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
 
 index this new warc
 cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
 cat TEST-000000.extracted.warc.cdxj
-org,wikipedia,an)/wiki/escopete 20240518015810 {"url": "https://an.wikipedia.org/wiki/Escopete", "mime": "text/html", "status": "200", "digest": "sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU", "length": "17455", "offset": "379", "filename": "TEST-000000.extracted.warc.gz"}
+org,wikipedia,an)/wiki/escopete 20240518015810 {"url": "https://an.wikipedia.org/wiki/Escopete", "mime": "text/html", "status": "200", "digest": "sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU", "length": "17455", "offset": "406", "filename": "TEST-000000.extracted.warc.gz"}
 
 iterate this new warc
 python ./warcio-iterator.py TEST-000000.extracted.warc.gz

diff --git a/notebooks/warcio_experiments.ipynb b/notebooks/warcio_experiments.ipynb
new file mode 100644
index 0000000..c51d87a
--- /dev/null
+++ b/notebooks/warcio_experiments.ipynb
@@ -0,0 +1,923 @@
+{
+ "cells": [
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-09T13:33:27.910213Z",
+     "start_time": "2025-10-09T13:33:27.895153Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ],
+   "id": "f142ae2305e8e09d",
+   "outputs": [],
+   "execution_count": 2
+  },
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-10-09T13:33:28.691992Z",
+     "start_time": "2025-10-09T13:33:28.678002Z"
+    }
+   },
+   "source": "from warcio.archiveiterator import ArchiveIterator\n",
+   "outputs": [],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-09T13:10:59.883851Z",
+     "start_time": "2025-10-09T13:10:59.857364Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "warc_path = \"/home/cc-pds/commoncrawl/crawl-data/CC-MAIN-2024-10/segments/1707947473347.0/warc/CC-MAIN-20240220211055-20240221001055-00101.warc.gz\"",
+   "id": "88a4052768f17978",
+   "outputs": [],
+   "execution_count": 5
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-09T13:33:31.045128Z",
+     "start_time": "2025-10-09T13:33:31.022226Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "\n",
+    "def dump_all_records(warc_path, limit: int=5):\n",
+    "    count = 0\n",
+    "    with open(warc_path, \"rb\") as f:\n",
+    "        for record in ArchiveIterator(f):\n",
+    "            if record.rec_type == \"response\":\n",
+    "                #print(record.rec_headers)\n",
+    "                print(\"url:\", record.rec_headers.get_header(\"WARC-Target-URI\"))\n",
+    "                print(\"content-type:\", record.http_headers.get_header(\"Content-Type\"))\n",
+    "                content = record.content_stream().read()\n",
+    "                print(\"content:\", content[:200])\n",
+    "                count += 1\n",
+    "                if count >= limit:\n",
+    "                    break\n",
+    "\n",
+    "def get_first_record(warc_path):\n",
+    "    with open(warc_path, \"rb\") as f:\n",
+    "        for record in ArchiveIterator(f):\n",
+    "            if record.rec_type == \"response\":\n",
+    "                return record"
+   ],
+   "id": "72d21cc15eb4b1c0",
+   "outputs": [],
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-09T13:19:25.393165Z",
+     "start_time": "2025-10-09T13:19:24.977645Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "dump_all_records(warc_path, limit=200)",
+   "id": "16d1afcec0c6de96",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "url: http://020zxdq.baiwanx.com.cn/?user=020zxdq\n",
+      "content-type: text/html\n",
+      "content: b''\n",
+      "url: http://04.ma/2017/05/05/%D8%A7%D9%84%D9%81%D8%B1%D8%A7%D8%B4%D8%A9-%D8%AF%D9%8A%D8%A7%D9%84-%D9%82%D9%8A%D8%B3%D8%A7%D8%B1%D9%8A%D8%A9-%D8%B3%D8%A8%D8%A7%D8%AA%D8%A9-%D8%AF%D8%A7%D8%B1%D9%88-%D9%88%D9%82%D9%81%D8%A9-%D9%82/\n",
+      "content-type: text/html; charset=UTF-8\n",
+      "content: b'\\n\\n\\n\\n\\n\\n\\n\\n