// archivebox/extractors/example_js_extractor.js
//
// Example JS extractor. Contains two entry points:
//   - extract(page):        records a screen-recording of a page and optionally converts it to a GIF
//   - botArchiveTask({...}): runs the full 4-stage archiving pipeline for a single URL
//
// NOTE(review): many helpers referenced below are neither imported nor defined in this file
// (e.g. once, PuppeteerScreenRecorder, blockUntilExists, prettyPath, archivebox.savePageState,
// URL_SCHEMES_IGNORED, ALREADY_ARCHIVED, TASKS_PER_RUN_LIMIT, DEFAULT_VIEWPORT, ANSI,
// SCREENRECORDGIF_PATH, AIQA_PATH and all the save*/setup*/start*/stop* steps).
// They must be brought into scope by whoever wires this example into the real codebase.

import child_process from 'node:child_process'
import fs from 'node:fs'
import path from 'node:path'

import {getEnvironmentConfig, getScopeConfig} from 'archivebox/util/config.js'
import {getPuppeteerPage} from 'archivebox/util/page.js'


const env_config = await getEnvironmentConfig()
// NOTE(review): `url` and `config` are not defined at module scope anywhere in this file,
// so this line throws a ReferenceError on import until they are provided -- confirm intent.
const snapshot_page = await getPuppeteerPage(url, config)


/**
 * Pull the rejection reasons out of a Promise.allSettled() result list.
 *
 * @param {Array<{status: string, reason?: any}>} results - settled results
 * @returns {Array<any>} the `reason` of every entry whose status is 'rejected'
 */
function rejectionReasons(results) {
    return results
        .filter(result => result.status === 'rejected')
        .map(result => result.reason)
}


/**
 * Record a screen-recording of `page` to `<cwd>/screenrecording.mp4` and, when
 * SCREENRECORDING_SAVE_GIF is enabled in the scope config, convert it to
 * `<cwd>/screenrecording.gif` using ffmpeg.
 *
 * The recorder is started when the page emits 'setup' and stopped when it emits
 * 'BEHAVIORS_FINISHED'; this function resolves once 'BEHAVIORS_FINISHED' has been emitted.
 *
 * @param {object} page - the page to record (must expose .on() and .url())
 * @returns {Promise<void>}
 */
async function extract(page) {
    const cwd = process.cwd()
    // botArchiveTask stores the requested URL on page._original_url; fall back to the
    // page's current URL when extract() is used on a page that was not set up by it.
    const url = page._original_url ?? page.url()
    const config = await getScopeConfig(url)

    const output_path = path.join(cwd, 'screenrecording.mp4')
    const gif_path = path.join(cwd, 'screenrecording.gif')
    let recorder = null
    const {
        SCREENRECORDING_DURATION_LIMIT=60,
        SCREENRECORDING_CODEC='libx264',
        SCREENRECORDING_SAVE_GIF=true,
    } = config ?? {}

    page.on('setup', async () => {

        recorder = new PuppeteerScreenRecorder(page, {
            followNewTab: false,
            recordDurationLimit: SCREENRECORDING_DURATION_LIMIT,
            // fps: 25,
            // ffmpeg_Path: '<path of ffmpeg_path>' || null,
            // videoFrame: {
            //   width: 1024,
            //   height: 768,
            // },
            // videoCrf: 18,
            videoCodec: SCREENRECORDING_CODEC,
            // videoPreset: 'ultrafast',
            // videoBitrate: 1000,
            // autopad: {
            //   color: 'black' | '#35A5FF',
            // },
            // aspectRatio: '4:3',
        });

        await recorder.start(output_path)
        // NOTE(review): `archivebox` is not defined in this file -- confirm where savePageState lives
        await archivebox.savePageState(page, {recorder})
    })
    await once(page, 'setup')
    await once(page, 'BEHAVIORS_STARTED')
    page.on('BEHAVIORS_FINISHED', async () => {
        if (!recorder) return
        await recorder.stop()

        // convert video to GIF
        if (SCREENRECORDING_SAVE_GIF) {
            try {
                const BIN_NAME = process.env.FFMPEG_BINARY || 'ffmpeg'
                const convert_started = Date.now()
                const child = child_process.spawn(
                    BIN_NAME,
                    [
                        '-hide_banner',
                        '-loglevel', 'error',
                        '-ss', '3',
                        '-t', '10',
                        '-y',
                        '-i', output_path,
                        '-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse",
                        '-loop', '0',
                        gif_path,
                    ],
                    {
                        cwd,
                        timeout: 60_000,
                        // stdio: [null, 'pipe', 'pipe'],
                        stdio: 'ignore',
                        detached: true,  // run in background, don't block on response
                    },
                )
                // spawn failures (e.g. ffmpeg binary missing) are delivered as an 'error' event,
                // not as a thrown exception; without a listener they would crash the process
                child.on('error', (err) => {
                    console.log('[❌] Failed to convert video to GIF:', err)
                })
                await blockUntilExists(gif_path, {min_bytes: 100, timeout: 40_000})
                const duration = Date.now() - convert_started
                console.log(`[🎥] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(gif_path))
            } catch(err) {
                console.log('[❌] Failed to convert video to GIF:', err)
            }
        }
    })
    await once(page, 'BEHAVIORS_FINISHED')
}

/**
 * Archive a single URL in the given page, in four logged stages:
 *   [0/4] snapshot dir + DB setup, [1/4] page setup hooks, [2/4] navigation + simulated
 *   browsing behaviors, [3/4] exclusive-use captures (screenshot, PDF), [4/4] parallel extractors.
 * Afterwards the page is reset to about:blank and closed, the best results are symlinked into
 * the snapshot dir, and the AI QA result file is read to decide success.
 *
 * @param {object} args
 * @param {object} args.page - the page to drive
 * @param {string} [args.data] - URL, used when `url` is empty (puppeteer-cluster passes it here)
 * @param {string} [args.url=''] - URL to archive
 * @returns {Promise<null|undefined|true>} null if the URL is skipped (ignored scheme or already
 *   archived), undefined if page setup / page load threw, true on success
 * @throws {string} when the main response status is 429, or when the AI QA score is below 50.
 *   NOTE(review): these are thrown as plain strings (kept as-is so existing callers see the
 *   same values); consider migrating to Error objects together with the callers.
 */
async function botArchiveTask({page, data, url=''}) {
    url = url || data  // puppeteer-cluster passes in the url value via the data: arg

    const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
    const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
    if (is_unarchivable_url || is_already_archived) return null
    ALREADY_ARCHIVED.add(url.slice(0, 4096))

    if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) {
        console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.')
        console.warn(' Run this process again to continue with the next batch...')
        process.exit(21)
    }

    const browser = await page.browser()
    const client = await page.target().createCDPSession()
    const extensions = await getChromeExtensionsFromCache({browser})
    const browser_version = await browser.version()
    const original_url = url.toString()
    const start_time = (new Date())

    console.log('[0/4]-------------------------------------------------------------------------')
    const snapshot_dir = await setupSnapshotDir({original_url, start_time})
    const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir})
    console.log('[1/4]-------------------------------------------------------------------------')
    console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)


    const page_state = {
        // global static state
        browser,
        client,
        browser_version,
        extensions,

        // per-page static metadata
        original_url,
        snapshot,
        snapshot_dir,
        start_time: start_time.toISOString(),
        start_ts: Number(start_time),
        version: versionStrFromDate(start_time),

        // per-page mutable archiving state
        main_response: null,
        recorder: null,
        console_log: [],
        traffic_log: {},
        redirects: {},
    }
    page._original_url = original_url

    try {
        // run all page setup functions in parallel
        const results = await Promise.allSettled([
            // loadAuthStorage(page, page_state, { apply: true }),
            startMetadataRecording(page, page_state),
            setupURLRewriting(page, page_state),
            // setupViewport(page, page_state),
            setupModalAutoClosing(page, page_state),
            loadCloudflareCookie(page, page_state),
            startResponseSaving(page, page_state),
            saveYTDLP(page, page_state),
            saveGALLERYDL(page, page_state),
            // saveSourceMaps(page, page_state),
            // TODO: someday setup https://github.com/osnr/TabFS ?
        ]);
        // report (but tolerate) any setup steps that failed
        const rejected = rejectionReasons(results);
        if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected);
    } catch(err) {
        // log the error object itself: JSON.stringify() of an Error yields '{}'
        console.error('[❌] PAGE SETUP ERROR', err)
        return
    }


    console.log('[2/4]-------------------------------------------------------------------------')

    console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
    const startrecording_promise = startScreenrecording(page, page_state)
    page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
    try {
        const results = await Promise.allSettled([
            startrecording_promise,
            page.bringToFront(),
            page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
        ])
        const rejected = rejectionReasons(results)
        if (rejected.length) console.warn('[⚠️] Parial failures during page load:', rejected)
    } catch(err) {
        console.error('[❌] ERROR DURING PAGE LOAD', err)
        return
    }

    if (page_state.main_response === null) {
        page_state.main_response = await page.waitForResponse(() => true)
    }
    if (!page_state.main_response) {
        throw new Error(`No main response was captured for ${url}`)
    }
    if (page_state.main_response.status() === 429) {
        throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
    }

    // emulate human browsing behavior
    // await disableAnimations(page, page_state);
    await jiggleMouse(page, page_state);
    await solveCaptchas(page, page_state);
    await blockRedirects(page, page_state);
    await scrollDown(page, page_state);
    // await expandComments(page, page_state);
    await submitForm(page, page_state);
    // await blockJSExecution(page, page_state);

    console.log('[3/4]-------------------------------------------------------------------------')

    // stop tampering with page requests & JS / recording metadata / traffic log
    await stopMetadataRecording(page, page_state)

    // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
    // (the screenrecording save is started here but only awaited with the wrap-up tasks below)
    const saveScreenrecording_promise = saveScreenrecording(page, page_state);
    await saveScreenshot(page, page_state);
    await savePDF(page, page_state);

    console.log('[4/4]-------------------------------------------------------------------------')

    // do all async archiving steps that can be run at the same time
    await inlineShadowDOM(page, page_state);
    const results = await Promise.allSettled([
        saveTitle(page, page_state),
        saveSEO(page, page_state),
        saveFavicon(page, page_state),
        saveSSL(page, page_state),
        saveRequests(page, page_state),
        saveRedirects(page, page_state),
        saveHeaders(page, page_state),
        saveRaw(page, page_state),
        saveDOM(page, page_state),
        saveBodyText(page, page_state),
        // savePandoc(page, page_state),
        saveReadability(page, page_state),
        saveAccessibility(page, page_state),
        saveOutlinks(page, page_state),
        // saveAuthStorage(page, page_state),
        saveAIQualityAssuranceResult(page, page_state),
    ]);

    // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
    // (intentionally not awaited yet: metrics + the summary line are emitted while these finish)
    const bg_results = Promise.allSettled([
        saveScreenrecording_promise,
        saveSinglefile(page, page_state),
        // saveArchiveWebPage(page, page_state),
        // savePocket(page, page_state),
    ])

    const {duration} = await saveMetrics(page, page_state);

    const rejected = rejectionReasons(results)
    if (rejected.length) {
        console.warn('[⚠️] Parial failures during archiving:', rejected)
    }

    // Start an interactive REPL here with the `page` instance.
    // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
    // await page.repl()
    // await page.browser().repl()

    console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`)

    try {
        const bg_rejected = rejectionReasons(await bg_results)
        if (bg_rejected.length) {
            console.warn('[⚠️] Parial failures during wrap-up tasks:', bg_rejected)
        }

        console.log('[🗑️] Resetting to about:blank to ensure memory is freed...')
        await page.goto('about:blank')
        await page.close()
    } catch(err) {
        console.log(err)
    }

    // symlink the best results from across all the versions/ into the snapshot dir root
    await symlinkBestSnapshotResults(snapshot_dir)

    // display latest version screenshot GIF
    console.log()
    try {
        const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page)))
        const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000})
        // overridable via env (same pattern as FFMPEG_BINARY); default is the original hard-coded path
        const IMGCAT_BIN = process.env.IMGCAT_BINARY || '/Users/squash/.iterm2/imgcat'
        const viewer = child_process.spawn(IMGCAT_BIN, [dirent.abspath], {stdio: [null, 'inherit', 'inherit']})
        // a missing binary surfaces as an async 'error' event that the surrounding try/catch
        // cannot see; without this listener it would crash the whole process
        viewer.on('error', (err) => {
            console.warn('[⚠️] Failed to display screenrecording.gif...', err)
        })
    } catch(err) {
        console.warn('[⚠️] Failed to display screenrecording.gif...', err)
        console.log()
    }

    // determine whether task succeeded or failed based on AI QA score
    const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page)))
    const qa_results = JSON.parse(await fs.promises.readFile(latest_version_aiqa, 'utf-8'))
    // tolerate QA result files that have no warnings list
    const qa_warnings = (qa_results.warnings ?? []).join(', ')
    if (qa_results.pct_visible < 50) {
        throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! ${qa_warnings} ${qa_results.error_text || ''}`
    } else {
        console.log(`[💫] Task completed succesfully: ${qa_results.pct_visible}% ${qa_warnings || ''}`)
        console.log(` Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`)
        return true
    }
}