mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-14 07:04:27 -04:00
wip new plugins system with browsertrix
This commit is contained in:
parent 16adff46dd
commit 1d4ec6f825
15 changed files with 3714 additions and 3 deletions
archivebox/plugins/replaywebpage/extractors.py  (new file, +50 lines)
@@ -0,0 +1,50 @@
# browsertrix extractor
# WIP: the import paths below are assumed -- `run`, `copy_and_overwrite`, and
# `TimedProgress` are expected to come from ArchiveBox's existing helpers
# (system.py / logging_util.py); adjust them once the plugins system settles.

from pathlib import Path

from archivebox.system import run, copy_and_overwrite
from archivebox.logging_util import TimedProgress


def save_browsertrix(link, out_dir, timeout, config):
    """Crawl link.url with browsertrix-crawler and save the resulting WACZ into out_dir."""

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    # use the snapshot timestamp as the browsertrix collection / crawl id
    crawl_id = link.timestamp

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # commands run inside the browsertrix container: wipe any previous crawl state,
    # then start the crawl (matches the example command at the bottom of this file)
    remote_cmd = f"""
        rm /tmp/dump.rdb;
        rm -rf /crawls/collections;
        mkdir /crawls/collections;
        env CRAWL_ID={crawl_id} {' '.join(browsertrix_crawler_cmd)}
    """

    # pipe the remote commands into the browsertrix container over netcat
    local_cmd = ['nc', 'browsertrix', '2222']

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)
        cmd_output = result.stdout.decode()

        # copy the finished WACZ out of the shared crawls volume into the snapshot dir
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'
        copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
    except Exception as err:
        status = 'failed'
        cmd_output = str(err)
    finally:
        timer.end()


TEMPLATE = """
"""

# rm /tmp/dump.rdb;
# rm -rf /crawls/collections;
# mkdir /crawls/collections;
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
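For quick local testing of this WIP extractor, here is a minimal sketch of calling save_browsertrix() directly. The SimpleNamespace stand-in for ArchiveBox's real Link model, the output path, and the timeout value are all hypothetical, and the call assumes a reachable `browsertrix` container listening on port 2222 as wired up by local_cmd above:

# hypothetical smoke-test harness, not part of this diff
from pathlib import Path
from types import SimpleNamespace

from archivebox.plugins.replaywebpage.extractors import save_browsertrix

# stand-in for ArchiveBox's Link model: only .url and .timestamp are used here
link = SimpleNamespace(url='https://example.com', timestamp='1715600000.0')

out_dir = Path('./archive') / link.timestamp
out_dir.mkdir(parents=True, exist_ok=True)

# requires the 'browsertrix' host from local_cmd to be resolvable and listening on 2222
save_browsertrix(link, out_dir=out_dir, timeout=120, config={})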