diff --git a/archivebox/legacy/templates/robots.txt b/archivebox/legacy/templates/robots.txt
new file mode 100644
index 0000000000..b338083e19
--- /dev/null
+++ b/archivebox/legacy/templates/robots.txt
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /
diff --git a/archivebox/legacy/util.py b/archivebox/legacy/util.py
index a1c823ffae..c4f1432855 100644
--- a/archivebox/legacy/util.py
+++ b/archivebox/legacy/util.py
@@ -5,8 +5,9 @@
import time
import shutil
+from string import Template
from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO
+from typing import List, Optional, Any, Union, IO, Mapping
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -396,10 +397,11 @@ def parse_date(date: Any) -> Optional[datetime]:
try:
return datetime.fromisoformat(date)
except Exception:
- try:
- return datetime.strptime(date, '%Y-%m-%d %H:%M')
- except Exception:
- pass
+ pass
+ try:
+ return datetime.strptime(date, '%Y-%m-%d %H:%M')
+ except Exception:
+ pass
raise ValueError('Tried to parse invalid date! {}'.format(date))
@@ -552,9 +554,12 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
- if os.path.exists(to_path):
- shutil.rmtree(to_path)
- shutil.copytree(from_path, to_path)
+ if os.path.isdir(from_path):
+ shutil.rmtree(to_path, ignore_errors=True)
+ shutil.copytree(from_path, to_path)
+ else:
+ with open(from_path, 'rb') as src:
+ atomic_write(src.read(), to_path)
@enforce_types
def chrome_args(**options) -> List[str]:
@@ -642,11 +647,27 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
return '\n'.join((header_str, *row_strs))
-def atomic_write(contents: Union[dict, str], path: str) -> None:
+@enforce_types
+def render_template(template_path: str, context: Mapping[str, str]) -> str:
+ """render a given html template string with the given template content"""
+
+ # will be replaced by django templates in the future
+ with open(template_path, 'r', encoding='utf-8') as template:
+ template_str = template.read()
+ return Template(template_str).substitute(**context)
+
+
+def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
- with open(tmp_file, 'w+', encoding='utf-8') as f:
+
+ if isinstance(contents, bytes):
+ args = {'mode': 'wb+'}
+ else:
+ args = {'mode': 'w+', 'encoding': 'utf-8'}
+
+ with open(tmp_file, **args) as f:
if isinstance(contents, dict):
to_json(contents, file=f)
else:
@@ -678,3 +699,5 @@ def reject_stdin(caller: str) -> None:
))
print()
raise SystemExit(1)
+
+
From 8101ce7f23039a27f86ba030f830c8c08795fd8b Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 02:25:39 -0400
Subject: [PATCH 0029/3688] add tests
---
archivebox/tests.py | 189 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 189 insertions(+)
create mode 100755 archivebox/tests.py
diff --git a/archivebox/tests.py b/archivebox/tests.py
new file mode 100755
index 0000000000..50090e9c82
--- /dev/null
+++ b/archivebox/tests.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox'
+
+import os
+import sys
+import shutil
+import unittest
+
+from contextlib import contextmanager
+
+TEST_CONFIG = {
+ 'OUTPUT_DIR': 'data.tests',
+ 'FETCH_MEDIA': 'False',
+ 'USE_CHROME': 'False',
+ 'SUBMIT_ARCHIVE_DOT_ORG': 'False',
+ 'SHOW_PROGRESS': 'False',
+ 'USE_COLOR': 'False',
+ 'FETCH_TITLE': 'False',
+ 'FETCH_FAVICON': 'False',
+ 'FETCH_WGET': 'False',
+}
+
+OUTPUT_DIR = 'data.tests'
+os.environ.update(TEST_CONFIG)
+
+from .legacy.main import init
+from .legacy.index import load_main_index
+
+from .cli import (
+ archivebox_init,
+ archivebox_add,
+ archivebox_remove,
+)
+
+HIDE_CLI_OUTPUT = True
+
+test_urls = '''
+https://example1.com/what/is/happening.html?what=1#how-about-this=1
+https://example2.com/what/is/happening/?what=1#how-about-this=1
+HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+https://example4.com/what/is/happening.html
+https://example5.com/
+https://example6.com
+
+http://example7.com
+[https://example8.com/what/is/this.php?what=1]
+[and http://example9.com?what=1&other=3#and-thing=2]
+https://example10.com#and-thing=2 "
+abcdef
+sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+example13.bada
+and example14.badb
+htt://example15.badc
+'''
+
+
+@contextmanager
+def output_hidden(show_failing=True):
+ stdout = sys.stdout
+ stderr = sys.stderr
+
+ if not HIDE_CLI_OUTPUT:
+ yield
+ return
+
+ sys.stdout = open('stdout.txt', 'w+')
+ sys.stderr = open('stderr.txt', 'w+')
+ try:
+ yield
+ sys.stdout.close()
+ sys.stderr.close()
+ sys.stdout = stdout
+ sys.stderr = stderr
+ except:
+ sys.stdout.close()
+ sys.stderr.close()
+ sys.stdout = stdout
+ sys.stderr = stderr
+ if show_failing:
+ with open('stdout.txt', 'r') as f:
+ print(f.read())
+ with open('stderr.txt', 'r') as f:
+ print(f.read())
+ raise
+
+
+class TestInit(unittest.TestCase):
+ def setUp(self):
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ def tearDown(self):
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+ def test_basic_init(self):
+ with output_hidden():
+ archivebox_init.main([])
+
+ def test_conflicting_init(self):
+ with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
+ f.write('test')
+
+ try:
+ with output_hidden(show_failing=False):
+ archivebox_init.main([])
+ assert False, 'Init should have exited with an exception'
+ except:
+ pass
+
+
+class TestAdd(unittest.TestCase):
+ def setUp(self):
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ with output_hidden():
+ init()
+
+ def tearDown(self):
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+ def test_add_arg_url(self):
+ with output_hidden():
+ archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
+
+ all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 30
+
+ def test_add_arg_file(self):
+ test_file = os.path.join(OUTPUT_DIR, 'test.txt')
+ with open(test_file, 'w+') as f:
+ f.write(test_urls)
+
+ with output_hidden():
+ archivebox_add.main([test_file])
+
+ all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 12
+ os.remove(test_file)
+
+ def test_add_stdin_url(self):
+ with output_hidden():
+ archivebox_add.main([], stdin=test_urls)
+
+ all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 12
+
+
+class TestRemove(unittest.TestCase):
+ def setUp(self):
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ with output_hidden():
+ init()
+ archivebox_add.main([], stdin=test_urls)
+
+ def tearDown(self):
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+ def test_remove_exact(self):
+ with output_hidden():
+ archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
+
+ all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 11
+
+ def test_remove_regex(self):
+ with output_hidden():
+ archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', 'http(s)?:\/\/(.+\.)?(example\d\.com)'])
+
+ all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 4
+
+ def test_remove_domain(self):
+ with output_hidden():
+ archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
+
+ all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 10
+
+ def test_remove_none(self):
+ try:
+ with output_hidden(show_failing=False):
+ archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
+ assert False, 'Should raise if no URLs match'
+ except:
+ pass
+
+
+if __name__ == '__main__':
+ unittest.main()
From ecf95d398a712f483af2569327eaaff8b75d30b3 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 02:27:38 -0400
Subject: [PATCH 0030/3688] cleanup after test output
---
archivebox/tests.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/archivebox/tests.py b/archivebox/tests.py
index 50090e9c82..80096e8a39 100755
--- a/archivebox/tests.py
+++ b/archivebox/tests.py
@@ -83,6 +83,9 @@ def output_hidden(show_failing=True):
with open('stderr.txt', 'r') as f:
print(f.read())
raise
+ finally:
+ os.remove('stdout.txt')
+ os.remove('stderr.txt')
class TestInit(unittest.TestCase):
From cdb70c73df0b593e08e00f6191e349fbbe3494c1 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 03:49:18 -0400
Subject: [PATCH 0031/3688] first working django model with archivebox-shell
command and sql exporting
---
archivebox/__init__.py | 2 ++
archivebox/cli/archivebox_remove.py | 5 ++-
archivebox/cli/archivebox_shell.py | 31 ++++++++++++++++++
archivebox/core/__init__.py | 1 +
archivebox/core/migrations/0001_initial.py | 28 ++++++++++++++++
.../migrations/0002_auto_20190417_0739.py | 27 ++++++++++++++++
archivebox/core/models.py | 32 ++++++++++++++++++-
archivebox/core/settings.py | 24 +++++++-------
archivebox/legacy/config.py | 14 ++++++--
archivebox/legacy/index.py | 16 ++++++++++
archivebox/legacy/main.py | 6 ++++
archivebox/legacy/mypy_django.ini | 10 ++++++
archivebox/legacy/storage/sql.py | 32 +++++++++++++++++++
archivebox/mypy.ini | 3 ++
archivebox/tests.py | 1 +
requirements.txt | 1 +
setup.py | 3 +-
17 files changed, 215 insertions(+), 21 deletions(-)
create mode 100644 archivebox/cli/archivebox_shell.py
create mode 100644 archivebox/core/migrations/0001_initial.py
create mode 100644 archivebox/core/migrations/0002_auto_20190417_0739.py
create mode 100644 archivebox/legacy/mypy_django.ini
create mode 100644 archivebox/legacy/storage/sql.py
create mode 100644 archivebox/mypy.ini
diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index b0c00b6118..4cd3afd52e 100644
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -1 +1,3 @@
__package__ = 'archivebox'
+
+from . import core
diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index d2b792f5a4..26bf826291 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -8,9 +8,8 @@
import argparse
-from ..legacy.main import list_archive_data, remove_archive_links
-from ..legacy.util import reject_stdin, to_csv, TimedProgress
-from ..legacy.config import ANSI
+from ..legacy.main import remove_archive_links
+from ..legacy.util import reject_stdin
def main(args=None):
diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py
new file mode 100644
index 0000000000..6fc84c4080
--- /dev/null
+++ b/archivebox/cli/archivebox_shell.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox shell'
+__description__ = 'Enter an interactive ArchiveBox Django shell'
+
+import sys
+import argparse
+
+from ..legacy.config import setup_django
+from ..legacy.util import reject_stdin
+
+
+def main(args=None):
+ args = sys.argv[1:] if args is None else args
+
+ parser = argparse.ArgumentParser(
+ prog=__command__,
+ description=__description__,
+ add_help=True,
+ )
+ parser.parse_args(args)
+ reject_stdin(__command__)
+
+ setup_django()
+ from django.core.management import call_command
+ call_command("shell_plus")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/core/__init__.py b/archivebox/core/__init__.py
index e69de29bb2..3e1d607ae4 100644
--- a/archivebox/core/__init__.py
+++ b/archivebox/core/__init__.py
@@ -0,0 +1 @@
+__package__ = 'archivebox.core'
diff --git a/archivebox/core/migrations/0001_initial.py b/archivebox/core/migrations/0001_initial.py
new file mode 100644
index 0000000000..366db56c9c
--- /dev/null
+++ b/archivebox/core/migrations/0001_initial.py
@@ -0,0 +1,28 @@
+# Generated by Django 2.2 on 2019-04-17 06:46
+
+from django.db import migrations, models
+import uuid
+
+
+class Migration(migrations.Migration):
+
+ initial = True
+
+ dependencies = [
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='Page',
+ fields=[
+ ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
+ ('url', models.URLField()),
+ ('timestamp', models.CharField(default=None, max_length=32, null=True)),
+ ('title', models.CharField(default=None, max_length=128, null=True)),
+ ('tags', models.CharField(default=None, max_length=256, null=True)),
+ ('added', models.DateTimeField(auto_now_add=True)),
+ ('bookmarked', models.DateTimeField()),
+ ('updated', models.DateTimeField(default=None, null=True)),
+ ],
+ ),
+ ]
diff --git a/archivebox/core/migrations/0002_auto_20190417_0739.py b/archivebox/core/migrations/0002_auto_20190417_0739.py
new file mode 100644
index 0000000000..a265c13d49
--- /dev/null
+++ b/archivebox/core/migrations/0002_auto_20190417_0739.py
@@ -0,0 +1,27 @@
+# Generated by Django 2.2 on 2019-04-17 07:39
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name='page',
+ name='bookmarked',
+ ),
+ migrations.AlterField(
+ model_name='page',
+ name='timestamp',
+ field=models.CharField(default=None, max_length=32, null=True, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='page',
+ name='url',
+ field=models.URLField(unique=True),
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 71a8362390..1951c37da7 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1,3 +1,33 @@
+__package__ = 'archivebox.core'
+
+import uuid
+
from django.db import models
-# Create your models here.
+
+class Page(models.Model):
+ id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+
+ url = models.URLField(unique=True)
+ timestamp = models.CharField(unique=True, max_length=32, null=True, default=None)
+
+ title = models.CharField(max_length=128, null=True, default=None)
+ tags = models.CharField(max_length=256, null=True, default=None)
+
+ added = models.DateTimeField(auto_now_add=True)
+ updated = models.DateTimeField(null=True, default=None)
+ # bookmarked = models.DateTimeField()
+
+ sql_args = ('url', 'timestamp', 'title', 'tags', 'updated')
+
+ @classmethod
+ def from_json(cls, info: dict):
+ info = {k: v for k, v in info.items() if k in cls.sql_args}
+ return cls(**info)
+
+ def as_json(self, *args) -> dict:
+ args = args or self.sql_args
+ return {
+ key: getattr(self, key)
+ for key in args
+ }
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index b7ffbe1805..b168e6e295 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -1,24 +1,22 @@
__package__ = 'archivebox.core'
-from ..legacy.config import (
- TEMPLATES_DIR,
- DATABASE_FILE,
-)
-
+import os
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
INSTALLED_APPS = [
- # 'django.contrib.admin',
- # 'django.contrib.auth',
- # 'django.contrib.contenttypes',
- # 'django.contrib.sessions',
- # 'django.contrib.messages',
- # 'django.contrib.staticfiles',
+ 'django.contrib.admin',
+ 'django.contrib.auth',
+ 'django.contrib.contenttypes',
+ 'django.contrib.sessions',
+ 'django.contrib.messages',
+ 'django.contrib.staticfiles',
'core',
+
+ 'django_extensions',
]
MIDDLEWARE = [
@@ -35,7 +33,7 @@
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
- 'DIRS': [TEMPLATES_DIR],
+ 'DIRS': ['templates'],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
@@ -53,7 +51,7 @@
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
- 'NAME': DATABASE_FILE,
+ 'NAME': os.path.join(os.path.abspath(os.curdir), 'database', 'database.sqlite3'),
}
}
diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py
index c158e52b0d..8842b79363 100644
--- a/archivebox/legacy/config.py
+++ b/archivebox/legacy/config.py
@@ -1,14 +1,15 @@
+__package__ = 'archivebox.legacy'
+
import os
import re
import sys
-import getpass
import django
+import getpass
import shutil
from typing import Optional
from subprocess import run, PIPE, DEVNULL
-
# ******************************************************************************
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
@@ -93,10 +94,11 @@ def stderr(*args):
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
DATABASE_DIR_NAME = 'database'
+DATABASE_FILE_NAME = 'database.sqlite3'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME)
-DATABASE_FILE = os.path.join(DATABASE_DIR, 'database.sqlite3')
+DATABASE_FILE = os.path.join(DATABASE_DIR, DATABASE_FILE_NAME)
PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
@@ -221,6 +223,12 @@ def find_chrome_data_dir() -> Optional[str]:
return None
+def setup_django():
+ import django
+ sys.path.append(PYTHON_DIR)
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
+ django.setup()
+
# ******************************************************************************
# ************************ Environment & Dependencies **************************
# ******************************************************************************
diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py
index 4df15e3048..173d6b7cf0 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -6,6 +6,8 @@
from .schema import Link, ArchiveResult
from .config import (
+ DATABASE_DIR,
+ DATABASE_FILE_NAME,
OUTPUT_DIR,
TIMEOUT,
URL_BLACKLIST_PTN,
@@ -19,6 +21,10 @@
parse_json_link_details,
write_json_link_details,
)
+from .storage.sql import (
+ write_sql_main_index,
+ parse_sql_main_index,
+)
from .util import (
scheme,
enforce_types,
@@ -204,6 +210,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
log_indexing_process_started()
+ log_indexing_started(DATABASE_DIR, DATABASE_FILE_NAME)
+ timer = TimedProgress(TIMEOUT * 2, prefix=' ')
+ try:
+ write_sql_main_index(links)
+ finally:
+ timer.end()
+ log_indexing_finished(DATABASE_DIR, DATABASE_FILE_NAME)
+
log_indexing_started(out_dir, 'index.json')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
@@ -228,6 +242,8 @@ def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) ->
existing_links: List[Link] = []
if out_dir:
existing_links = list(parse_json_main_index(out_dir))
+ existing_sql_links = list(parse_sql_main_index())
+ assert set(l.url for l in existing_links) == set(l['url'] for l in existing_sql_links)
new_links: List[Link] = []
if import_path:
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
index c437d5d4bd..72e949ad0a 100644
--- a/archivebox/legacy/main.py
+++ b/archivebox/legacy/main.py
@@ -22,6 +22,7 @@
DATABASE_DIR,
check_dependencies,
check_data_folder,
+ setup_django,
)
from .logs import (
log_archiving_started,
@@ -75,6 +76,11 @@ def init():
write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+ setup_django()
+ from django.core.management import call_command
+ call_command("makemigrations", interactive=False)
+ call_command("migrate", interactive=False)
+
stderr('{green}[√] Done.{reset}'.format(**ANSI))
diff --git a/archivebox/legacy/mypy_django.ini b/archivebox/legacy/mypy_django.ini
new file mode 100644
index 0000000000..306e567cd2
--- /dev/null
+++ b/archivebox/legacy/mypy_django.ini
@@ -0,0 +1,10 @@
+[mypy_django_plugin]
+
+# specify settings module to use for django.conf.settings, this setting
+# could also be specified with DJANGO_SETTINGS_MODULE environment variable
+# (it also takes priority over config file)
+django_settings = core.settings
+
+# if True, all unknown settings in django.conf.settings will fallback to Any,
+# specify it if your settings are loaded dynamically to avoid false positives
+ignore_missing_settings = True
diff --git a/archivebox/legacy/storage/sql.py b/archivebox/legacy/storage/sql.py
new file mode 100644
index 0000000000..c4f03bb0c6
--- /dev/null
+++ b/archivebox/legacy/storage/sql.py
@@ -0,0 +1,32 @@
+__package__ = 'archivebox.legacy.storage'
+
+from typing import List, Iterator
+
+from ..schema import Link
+from ..util import enforce_types
+from ..config import setup_django
+
+
+### Main Links Index
+
+sql_keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+
+
+@enforce_types
+def parse_sql_main_index() -> Iterator[Link]:
+ setup_django()
+ from core.models import Page
+
+ return (
+ page.as_json(*sql_keys)
+ for page in Page.objects.all()
+ )
+
+@enforce_types
+def write_sql_main_index(links: List[Link]) -> None:
+ setup_django()
+ from core.models import Page
+
+ for link in links:
+ info = {k: v for k, v in link._asdict().items() if k in sql_keys}
+ Page.objects.update_or_create(url=link.url, defaults=info)
diff --git a/archivebox/mypy.ini b/archivebox/mypy.ini
new file mode 100644
index 0000000000..b1b4489ae4
--- /dev/null
+++ b/archivebox/mypy.ini
@@ -0,0 +1,3 @@
+[mypy]
+plugins =
+ mypy_django_plugin.main
diff --git a/archivebox/tests.py b/archivebox/tests.py
index 80096e8a39..6afb6c7d6c 100755
--- a/archivebox/tests.py
+++ b/archivebox/tests.py
@@ -2,6 +2,7 @@
__package__ = 'archivebox'
+
import os
import sys
import shutil
diff --git a/requirements.txt b/requirements.txt
index eb9861dd5f..d7b43bc14e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@ base32-crockford
setuptools
ipdb
mypy
+django-stubs
flake8
#wpull
diff --git a/setup.py b/setup.py
index b6137740aa..1c048d8a6d 100644
--- a/setup.py
+++ b/setup.py
@@ -36,9 +36,10 @@
packages=setuptools.find_packages(),
python_requires='>=3.6',
install_requires=[
+ "dataclasses==0.6",
"base32-crockford==0.3.0",
"django==2.2",
- "dataclasses==0.6",
+ "django-extensions==2.1.6",
],
entry_points={
'console_scripts': [
From 35aa8c8902dc7a68b8954e7d113bd4ac17650482 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 03:50:41 -0400
Subject: [PATCH 0032/3688] clearer sql parsing and dumping
---
archivebox/core/models.py | 6 +++---
archivebox/legacy/storage/sql.py | 7 ++-----
2 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 1951c37da7..6fdcdae2c0 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -18,15 +18,15 @@ class Page(models.Model):
updated = models.DateTimeField(null=True, default=None)
# bookmarked = models.DateTimeField()
- sql_args = ('url', 'timestamp', 'title', 'tags', 'updated')
+ keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@classmethod
def from_json(cls, info: dict):
- info = {k: v for k, v in info.items() if k in cls.sql_args}
+ info = {k: v for k, v in info.items() if k in cls.keys}
return cls(**info)
def as_json(self, *args) -> dict:
- args = args or self.sql_args
+ args = args or self.keys
return {
key: getattr(self, key)
for key in args
diff --git a/archivebox/legacy/storage/sql.py b/archivebox/legacy/storage/sql.py
index c4f03bb0c6..90a0c41225 100644
--- a/archivebox/legacy/storage/sql.py
+++ b/archivebox/legacy/storage/sql.py
@@ -9,16 +9,13 @@
### Main Links Index
-sql_keys = ('url', 'timestamp', 'title', 'tags', 'updated')
-
-
@enforce_types
def parse_sql_main_index() -> Iterator[Link]:
setup_django()
from core.models import Page
return (
- page.as_json(*sql_keys)
+ page.as_json(*Page.keys)
for page in Page.objects.all()
)
@@ -28,5 +25,5 @@ def write_sql_main_index(links: List[Link]) -> None:
from core.models import Page
for link in links:
- info = {k: v for k, v in link._asdict().items() if k in sql_keys}
+ info = {k: v for k, v in link._asdict().items() if k in Page.keys}
Page.objects.update_or_create(url=link.url, defaults=info)
From 88a37bc552b5d12cce75afbeb89c844267e9bd4e Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 05:41:41 -0400
Subject: [PATCH 0033/3688] fix json list output
---
archivebox/cli/archivebox_list.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 337bebac96..d421f8de90 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -76,10 +76,11 @@ def main(args=None):
if command.sort:
links = sorted(links, key=lambda link: getattr(link, command.sort))
+
if command.csv:
print(to_csv(links, csv_cols=command.csv.split(','), header=True))
elif command.json:
- print(to_json(links, indent=4, sort_keys=True))
+ print(to_json(list(links), indent=4, sort_keys=True))
else:
print('\n'.join(link.url for link in links))
From 289a6ea30f3d34a72f539d72f17f10f9d14d637b Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 05:42:09 -0400
Subject: [PATCH 0034/3688] fix database file location and init process
---
archivebox/core/settings.py | 11 +++++---
archivebox/legacy/config.py | 4 +--
archivebox/legacy/logs.py | 2 +-
archivebox/legacy/main.py | 54 +++++++++++++++++++++----------------
4 files changed, 42 insertions(+), 29 deletions(-)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index b168e6e295..ff1fbe674c 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -5,6 +5,11 @@
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
+OUTPUT_DIR = os.path.abspath(os.curdir)
+DATABASE_DIR_NAME = 'database'
+DATABASE_FILE_NAME = 'database.sqlite3'
+DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+
INSTALLED_APPS = [
'django.contrib.admin',
@@ -15,7 +20,7 @@
'django.contrib.staticfiles',
'core',
-
+
'django_extensions',
]
@@ -51,7 +56,7 @@
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
- 'NAME': os.path.join(os.path.abspath(os.curdir), 'database', 'database.sqlite3'),
+ 'NAME': DATABASE_FILE,
}
}
@@ -67,7 +72,7 @@
TIME_ZONE = 'UTC'
USE_I18N = True
USE_L10N = True
-USE_TZ = True
+USE_TZ = False
STATIC_URL = '/static/'
diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py
index 8842b79363..64c4ce8780 100644
--- a/archivebox/legacy/config.py
+++ b/archivebox/legacy/config.py
@@ -98,7 +98,7 @@ def stderr(*args):
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME)
-DATABASE_FILE = os.path.join(DATABASE_DIR, DATABASE_FILE_NAME)
+DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
@@ -346,7 +346,7 @@ def setup_django():
'DATABASE_DIR': {
'path': os.path.abspath(DATABASE_DIR),
'enabled': True,
- 'is_valid': os.path.exists(os.path.join(DATABASE_DIR, DATABASE_FILE)),
+ 'is_valid': os.path.exists(DATABASE_FILE),
},
'CHROME_USER_DATA_DIR': {
'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
diff --git a/archivebox/legacy/logs.py b/archivebox/legacy/logs.py
index 8b0dda9f9b..0f3eb5dc84 100644
--- a/archivebox/legacy/logs.py
+++ b/archivebox/legacy/logs.py
@@ -71,7 +71,7 @@ def log_indexing_started(out_dir: str, out_file: str):
def log_indexing_finished(out_dir: str, out_file: str):
end_ts = datetime.now()
_LAST_RUN_STATS.index_end_ts = end_ts
- print('\r √ {}/{}'.format(pretty_path(out_dir), out_file))
+ print('\r √ {}/{}'.format(out_dir, out_file))
### Archiving Stage
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
index 72e949ad0a..0dd4ffd668 100644
--- a/archivebox/legacy/main.py
+++ b/archivebox/legacy/main.py
@@ -20,6 +20,7 @@
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
+ DATABASE_FILE,
check_dependencies,
check_data_folder,
setup_django,
@@ -39,21 +40,19 @@
def init():
os.makedirs(OUTPUT_DIR, exist_ok=True)
- harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
+ harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv', 'sources', 'archive', 'database', 'logs', 'static'}
is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files)
existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
- if not is_empty:
+ if is_empty:
+ stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
+ write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+ else:
if existing_index:
- stderr('{green}[√] You already have an archive index in: {}{reset}'.format(OUTPUT_DIR, **ANSI))
- stderr(' To add new links, you can run:')
- stderr(" archivebox add 'https://example.com'")
- stderr()
- stderr(' For more usage and examples, run:')
- stderr(' archivebox help')
- # TODO: import old archivebox version's archive data folder
-
- raise SystemExit(1)
+ stderr('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
+ stderr(f' {OUTPUT_DIR}')
+ stderr(f' > index.html')
+ stderr(f' > index.json')
else:
stderr(
("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
@@ -65,23 +64,32 @@ def init():
)
raise SystemExit(1)
-
- stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
- os.makedirs(SOURCES_DIR)
- stderr(f' > {SOURCES_DIR}')
- os.makedirs(ARCHIVE_DIR)
- stderr(f' > {ARCHIVE_DIR}')
- os.makedirs(DATABASE_DIR)
- stderr(f' > {DATABASE_DIR}')
-
- write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+ os.makedirs(SOURCES_DIR, exist_ok=True)
+ stderr(f' > sources/')
+ os.makedirs(ARCHIVE_DIR, exist_ok=True)
+ stderr(f' > archive/')
+ os.makedirs(DATABASE_DIR, exist_ok=True)
setup_django()
from django.core.management import call_command
+ from django.contrib.auth.models import User
+ stderr(f' > database/')
+
+ stderr('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
call_command("makemigrations", interactive=False)
call_command("migrate", interactive=False)
-
- stderr('{green}[√] Done.{reset}'.format(**ANSI))
+
+ if not User.objects.filter(is_superuser=True).exists():
+ stderr('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+ call_command("createsuperuser", interactive=True)
+
+ stderr('\n{green}------------------------------------------------------------{reset}'.format(**ANSI))
+ stderr('{green}[√] Done. ArchiveBox collection is set up in current folder.{reset}'.format(**ANSI))
+ stderr(' To add new links, you can run:')
+ stderr(" archivebox add 'https://example.com'")
+ stderr()
+ stderr(' For more usage and examples, run:')
+ stderr(' archivebox help')
From 669bd6bee43430d75b8718cb17f373aaed7d3c86 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 05:42:21 -0400
Subject: [PATCH 0035/3688] first views for archivebox server
---
archivebox/core/urls.py | 22 ++++++----------------
archivebox/core/views.py | 16 +++++++++++++++-
2 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index a077ec78dd..a105c91c94 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -1,21 +1,11 @@
-"""archivebox URL Configuration
-
-The `urlpatterns` list routes URLs to views. For more information please see:
- https://docs.djangoproject.com/en/2.1/topics/http/urls/
-Examples:
-Function views
- 1. Add an import: from my_app import views
- 2. Add a URL to urlpatterns: path('', views.home, name='home')
-Class-based views
- 1. Add an import: from other_app.views import Home
- 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
-Including another URLconf
- 1. Import the include() function: from django.urls import include, path
- 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
-"""
from django.contrib import admin
from django.urls import path
+
+from core.views import MainIndex, LinkDetails
+
urlpatterns = [
- path('admin/', admin.site.urls),
+ path('', admin.site.urls),
+ path('archive//', LinkDetails.as_view(), name='LinkDetails'),
+ path('main/', MainIndex.as_view(), name='Home'),
]
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 91ea44a218..2d429ee2e5 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -1,3 +1,17 @@
from django.shortcuts import render
-# Create your views here.
+from django.views import View
+
+
+class MainIndex(View):
+ template = 'main_index.html'
+
+ def get(self, request):
+ return render(self.template, {})
+
+
+class LinkDetails(View):
+ template = 'link_details.html'
+
+ def get(self, request):
+ return render(self.template, {})
From 920898e160e5049989967fd9837c386904cd9fdd Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 05:42:35 -0400
Subject: [PATCH 0036/3688] working archivebox-server command
---
archivebox/cli/archivebox_server.py | 38 +++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
create mode 100644 archivebox/cli/archivebox_server.py
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
new file mode 100644
index 0000000000..4113ed10f4
--- /dev/null
+++ b/archivebox/cli/archivebox_server.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox server'
+__description__ = 'Run the ArchiveBox HTTP server'
+
+import sys
+import argparse
+
+from ..legacy.config import setup_django
+from ..legacy.util import reject_stdin
+
+
+def main(args=None):
+ args = sys.argv[1:] if args is None else args
+
+ parser = argparse.ArgumentParser(
+ prog=__command__,
+ description=__description__,
+ add_help=True,
+ )
+ parser.add_argument(
+ 'runserver_args',
+ nargs='*',
+ type=str,
+ default=None,
+ help='Arguments to pass to Django runserver'
+ )
+ command = parser.parse_args(args)
+ reject_stdin(__command__)
+
+ setup_django()
+ from django.core.management import call_command
+ call_command("runserver", *command.runserver_args)
+
+
+if __name__ == '__main__':
+ main()
From 4f869f235f322edca1d6d831b294cdc46e3bfc07 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 17 Apr 2019 22:00:54 -0400
Subject: [PATCH 0037/3688] add package headers
---
archivebox/legacy/storage/html.py | 2 ++
archivebox/legacy/storage/json.py | 2 ++
2 files changed, 4 insertions(+)
diff --git a/archivebox/legacy/storage/html.py b/archivebox/legacy/storage/html.py
index 2ca4a2fcbe..bc58cb566e 100644
--- a/archivebox/legacy/storage/html.py
+++ b/archivebox/legacy/storage/html.py
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.legacy.storage'
+
import os
from datetime import datetime
diff --git a/archivebox/legacy/storage/json.py b/archivebox/legacy/storage/json.py
index de581910fd..697d318b02 100644
--- a/archivebox/legacy/storage/json.py
+++ b/archivebox/legacy/storage/json.py
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.legacy.storage'
+
import os
import json
From 39a0ab30138be1f816d979aa046689a8e9f3d618 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Thu, 18 Apr 2019 21:09:54 -0400
Subject: [PATCH 0038/3688] add pipenv, schedule cmd, logs dir, and lots more
---
Pipfile | 22 ++
Pipfile.lock | 314 ++++++++++++++++++++++++++
archivebox/cli/__init__.py | 49 +++-
archivebox/cli/archivebox_add.py | 1 +
archivebox/cli/archivebox_init.py | 1 -
archivebox/cli/archivebox_schedule.py | 194 ++++++++++++++++
archivebox/cli/archivebox_server.py | 4 +-
archivebox/cli/archivebox_shell.py | 4 +-
archivebox/core/settings.py | 8 +-
archivebox/env.py | 15 --
archivebox/legacy/config.py | 42 ++--
archivebox/legacy/index.py | 90 ++++----
archivebox/legacy/logs.py | 24 +-
archivebox/legacy/main.py | 101 ++++++---
archivebox/legacy/storage/html.py | 5 +-
archivebox/legacy/storage/json.py | 39 +++-
archivebox/legacy/storage/sql.py | 10 +-
archivebox/tests.py | 51 ++++-
requirements.txt | 17 --
setup.py | 11 +-
20 files changed, 817 insertions(+), 185 deletions(-)
create mode 100644 Pipfile
create mode 100644 Pipfile.lock
create mode 100644 archivebox/cli/archivebox_schedule.py
delete mode 100644 archivebox/env.py
delete mode 100644 requirements.txt
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000000..4ba4d08e4b
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,22 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+ipdb = "*"
+flake8 = "*"
+mypy = "*"
+django-stubs = "*"
+setuptools = "*"
+
+[packages]
+dataclasses = "*"
+base32-crockford = "*"
+django = "*"
+youtube-dl = "*"
+python-crontab = "*"
+croniter = "*"
+
+[requires]
+python_version = ">=3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000000..9b05ded293
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,314 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "7f25fb9c97e469fdb787e755c5756e2be4b0b649e3c5ad8feb17200b32d3bb36"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": ">=3.6"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "base32-crockford": {
+ "hashes": [
+ "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969",
+ "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"
+ ],
+ "index": "pypi",
+ "version": "==0.3.0"
+ },
+ "croniter": {
+ "hashes": [
+ "sha256:625949cbd38a0b2325295591940dfa5fa0dfca41d03150ae0284a924e0be10f0",
+ "sha256:66b6a9c6b2d1a85d4af51453b2328be775a173e688b69eb3a96a7ec752ba77a3"
+ ],
+ "index": "pypi",
+ "version": "==0.3.29"
+ },
+ "dataclasses": {
+ "hashes": [
+ "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f",
+ "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"
+ ],
+ "index": "pypi",
+ "version": "==0.6"
+ },
+ "django": {
+ "hashes": [
+ "sha256:7c3543e4fb070d14e10926189a7fcf42ba919263b7473dceaefce34d54e8a119",
+ "sha256:a2814bffd1f007805b19194eb0b9a331933b82bd5da1c3ba3d7b7ba16e06dc4b"
+ ],
+ "index": "pypi",
+ "version": "==2.2"
+ },
+ "python-crontab": {
+ "hashes": [
+ "sha256:91ce4b245ee5e5c117aa0b21b485bc43f2d80df854a36e922b707643f50d7923"
+ ],
+ "index": "pypi",
+ "version": "==2.3.6"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+ "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ ],
+ "version": "==2.8.0"
+ },
+ "pytz": {
+ "hashes": [
+ "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
+ "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
+ ],
+ "version": "==2019.1"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "sqlparse": {
+ "hashes": [
+ "sha256:40afe6b8d4b1117e7dff5504d7a8ce07d9a1b15aeeade8a2d10f130a834f8177",
+ "sha256:7c3dca29c022744e95b547e867cee89f4fce4373f3549ccd8797d8eb52cdb873"
+ ],
+ "version": "==0.3.0"
+ },
+ "youtube-dl": {
+ "hashes": [
+ "sha256:0d25459093870bf560bccafe9015e59402d7de1b2c956593623ba4c2840153e5",
+ "sha256:ea0824ae9a166059ec754c267480198a074bd899c20b2ba497809bac099cde2e"
+ ],
+ "index": "pypi",
+ "version": "==2019.4.17"
+ }
+ },
+ "develop": {
+ "appnope": {
+ "hashes": [
+ "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
+ "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
+ ],
+ "markers": "sys_platform == 'darwin'",
+ "version": "==0.1.0"
+ },
+ "backcall": {
+ "hashes": [
+ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+ "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+ ],
+ "version": "==0.1.0"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
+ "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+ ],
+ "version": "==4.4.0"
+ },
+ "django-stubs": {
+ "hashes": [
+ "sha256:9c06a4b28fc8c18f6abee4f199f8ee29cb5cfcecf349e912ded31cb3526ea2b6",
+ "sha256:9ef230843a24b5d74f2ebd4c60f9bea09c21911bc119d0325e8bb47e2f495e70"
+ ],
+ "index": "pypi",
+ "version": "==0.12.1"
+ },
+ "entrypoints": {
+ "hashes": [
+ "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
+ "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
+ ],
+ "version": "==0.3"
+ },
+ "flake8": {
+ "hashes": [
+ "sha256:859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661",
+ "sha256:a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8"
+ ],
+ "index": "pypi",
+ "version": "==3.7.7"
+ },
+ "ipdb": {
+ "hashes": [
+ "sha256:dce2112557edfe759742ca2d0fee35c59c97b0cc7a05398b791079d78f1519ce"
+ ],
+ "index": "pypi",
+ "version": "==0.12"
+ },
+ "ipython": {
+ "hashes": [
+ "sha256:b038baa489c38f6d853a3cfc4c635b0cda66f2864d136fe8f40c1a6e334e2a6b",
+ "sha256:f5102c1cd67e399ec8ea66bcebe6e3968ea25a8977e53f012963e5affeb1fe38"
+ ],
+ "markers": "python_version >= '3.4'",
+ "version": "==7.4.0"
+ },
+ "ipython-genutils": {
+ "hashes": [
+ "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+ "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+ ],
+ "version": "==0.2.0"
+ },
+ "jedi": {
+ "hashes": [
+ "sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b",
+ "sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c"
+ ],
+ "version": "==0.13.3"
+ },
+ "mccabe": {
+ "hashes": [
+ "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+ "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ ],
+ "version": "==0.6.1"
+ },
+ "mypy": {
+ "hashes": [
+ "sha256:2afe51527b1f6cdc4a5f34fc90473109b22bf7f21086ba3e9451857cf11489e6",
+ "sha256:56a16df3e0abb145d8accd5dbb70eba6c4bd26e2f89042b491faa78c9635d1e2",
+ "sha256:5764f10d27b2e93c84f70af5778941b8f4aa1379b2430f85c827e0f5464e8714",
+ "sha256:5bbc86374f04a3aa817622f98e40375ccb28c4836f36b66706cf3c6ccce86eda",
+ "sha256:6a9343089f6377e71e20ca734cd8e7ac25d36478a9df580efabfe9059819bf82",
+ "sha256:6c9851bc4a23dc1d854d3f5dfd5f20a016f8da86bcdbb42687879bb5f86434b0",
+ "sha256:b8e85956af3fcf043d6f87c91cbe8705073fc67029ba6e22d3468bfee42c4823",
+ "sha256:b9a0af8fae490306bc112229000aa0c2ccc837b49d29a5c42e088c132a2334dd",
+ "sha256:bbf643528e2a55df2c1587008d6e3bda5c0445f1240dfa85129af22ae16d7a9a",
+ "sha256:c46ab3438bd21511db0f2c612d89d8344154c0c9494afc7fbc932de514cf8d15",
+ "sha256:f7a83d6bd805855ef83ec605eb01ab4fa42bcef254b13631e451cbb44914a9b0"
+ ],
+ "index": "pypi",
+ "version": "==0.701"
+ },
+ "mypy-extensions": {
+ "hashes": [
+ "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812",
+ "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e"
+ ],
+ "version": "==0.4.1"
+ },
+ "parso": {
+ "hashes": [
+ "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33",
+ "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376"
+ ],
+ "version": "==0.4.0"
+ },
+ "pexpect": {
+ "hashes": [
+ "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
+ "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ ],
+ "markers": "sys_platform != 'win32'",
+ "version": "==4.7.0"
+ },
+ "pickleshare": {
+ "hashes": [
+ "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+ "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+ ],
+ "version": "==0.7.5"
+ },
+ "prompt-toolkit": {
+ "hashes": [
+ "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+ "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+ "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+ ],
+ "version": "==2.0.9"
+ },
+ "ptyprocess": {
+ "hashes": [
+ "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+ "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ ],
+ "version": "==0.6.0"
+ },
+ "pycodestyle": {
+ "hashes": [
+ "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56",
+ "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"
+ ],
+ "version": "==2.5.0"
+ },
+ "pyflakes": {
+ "hashes": [
+ "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0",
+ "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"
+ ],
+ "version": "==2.1.1"
+ },
+ "pygments": {
+ "hashes": [
+ "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
+ "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
+ ],
+ "version": "==2.3.1"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "traitlets": {
+ "hashes": [
+ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+ "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+ ],
+ "version": "==4.3.2"
+ },
+ "typed-ast": {
+ "hashes": [
+ "sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200",
+ "sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0",
+ "sha256:252fdae740964b2d3cdfb3f84dcb4d6247a48a6abe2579e8029ab3be3cdc026c",
+ "sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99",
+ "sha256:2c88d0a913229a06282b285f42a31e063c3bf9071ff65c5ea4c12acb6977c6a7",
+ "sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1",
+ "sha256:3d2e3ab175fc097d2a51c7a0d3fda442f35ebcc93bb1d7bd9b95ad893e44c04d",
+ "sha256:4766dd695548a15ee766927bf883fb90c6ac8321be5a60c141f18628fb7f8da8",
+ "sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de",
+ "sha256:5cddb6f8bce14325b2863f9d5ac5c51e07b71b462361fd815d1d7706d3a9d682",
+ "sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db",
+ "sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8",
+ "sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7",
+ "sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f",
+ "sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15",
+ "sha256:bb27d4e7805a7de0e35bd0cb1411bc85f807968b2b0539597a49a23b00a622ae",
+ "sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3",
+ "sha256:f0937165d1e25477b01081c4763d2d9cdc3b18af69cb259dd4f640c9b900fe5e",
+ "sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a",
+ "sha256:fc71d2d6ae56a091a8d94f33ec9d0f2001d1cb1db423d8b4355debfe9ce689b7"
+ ],
+ "version": "==1.3.4"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:07b2c978670896022a43c4b915df8958bec4a6b84add7f2c87b2b728bda3ba64",
+ "sha256:f3f0e67e1d42de47b5c67c32c9b26641642e9170fe7e292991793705cd5fef7c",
+ "sha256:fb2cd053238d33a8ec939190f30cfd736c00653a85a2919415cecf7dc3d9da71"
+ ],
+ "version": "==3.7.2"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ }
+ }
+}
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 869724a35a..ae78531bba 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -1,30 +1,59 @@
__package__ = 'archivebox.cli'
import os
+
+from typing import Dict
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
-required_attrs = ('__package__', '__command__', '__description__', 'main')
+# these common commands will appear sorted before any others for ease-of-use
+display_first = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
+# every imported command module must have these properties in order to be valid
+required_attrs = ('__package__', '__command__', 'main')
-order = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
+# basic checks to make sure imported files are valid subcommands
+is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
+is_valid_cli_module = lambda module, subcommand: (
+ all(hasattr(module, attr) for attr in required_attrs)
+ and module.__command__.split(' ')[-1] == subcommand
+)
+def list_subcommands() -> Dict[str, str]:
+    """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
-def list_subcommands():
COMMANDS = []
for filename in os.listdir(CLI_DIR):
- if filename.startswith('archivebox_') and filename.endswith('.py'):
+ if is_cli_module(filename):
subcommand = filename.replace('archivebox_', '').replace('.py', '')
module = import_module('.archivebox_{}'.format(subcommand), __package__)
+ assert is_valid_cli_module(module, subcommand)
+ COMMANDS.append((subcommand, module.__description__)) # type: ignore
+ globals()[subcommand] = module.main
+ module.main.__doc__ = module.__description__
+
+ display_order = lambda cmd: (
+ display_first.index(cmd[0])
+ if cmd[0] in display_first else
+ 100 + len(cmd[0])
+ )
- assert all(hasattr(module, attr) for attr in required_attrs)
- assert module.__command__.split(' ')[-1] == subcommand
- COMMANDS.append((subcommand, module.__description__))
+ return dict(sorted(COMMANDS, key=display_order))
- return dict(sorted(COMMANDS, key=lambda cmd: order.index(cmd[0]) if cmd[0] in order else 10 + len(cmd[0])))
+def run_subcommand(subcommand: str, args=None) -> None:
+ """run a given ArchiveBox subcommand with the given list of args"""
-def run_subcommand(subcommand: str, args=None):
module = import_module('.archivebox_{}'.format(subcommand), __package__)
- return module.main(args) # type: ignore
+ module.main(args) # type: ignore
+
+
+SUBCOMMANDS = list_subcommands()
+
+__all__ = (
+ 'SUBCOMMANDS',
+ 'list_subcommands',
+ 'run_subcommand',
+ *SUBCOMMANDS.keys(),
+)
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 26ea1e2d4d..33f5e9234e 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -82,5 +82,6 @@ def main(args=None, stdin=None):
only_new=command.only_new,
)
+
if __name__ == '__main__':
main()
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
index f5757f8c17..632b9a1ebd 100755
--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -4,7 +4,6 @@
__command__ = 'archivebox init'
__description__ = 'Initialize a new ArchiveBox collection in the current directory'
-import os
import sys
import argparse
diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py
new file mode 100644
index 0000000000..44f4c73c4e
--- /dev/null
+++ b/archivebox/cli/archivebox_schedule.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox schedule'
+__description__ = 'Set ArchiveBox to run regularly at a specific time'
+
+import os
+import sys
+import argparse
+
+from datetime import datetime
+from crontab import CronTab, CronSlices
+
+
+from ..legacy.util import reject_stdin
+from ..legacy.config import (
+ OUTPUT_DIR,
+ LOGS_DIR,
+ ARCHIVEBOX_BINARY,
+ USER,
+ ANSI,
+ stderr,
+)
+
+
+CRON_COMMENT = 'archivebox_schedule'
+
+
+def main(args=None):
+ args = sys.argv[1:] if args is None else args
+
+ parser = argparse.ArgumentParser(
+ prog=__command__,
+ description=__description__,
+ add_help=True,
+ )
+ parser.add_argument(
+ '--quiet', '-q',
+ action='store_true',
+ help=("Don't warn about storage space."),
+ )
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument(
+ '--add', # '-a',
+ action='store_true',
+ help='Add a new scheduled ArchiveBox update job to cron',
+ )
+ parser.add_argument(
+ '--every', # '-e',
+ type=str,
+ default='daily',
+ help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
+ )
+ group.add_argument(
+ '--clear', # '-c'
+ action='store_true',
+ help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
+ )
+ group.add_argument(
+ '--show', # '-s'
+ action='store_true',
+ help=("Print a list of currently active ArchiveBox cron jobs"),
+ )
+ group.add_argument(
+ '--foreground', '-f',
+ action='store_true',
+ help=("Launch ArchiveBox as a long-running foreground task "
+ "instead of using cron."),
+ )
+ group.add_argument(
+ '--run-all', # '-a',
+ action='store_true',
+ help='Run all the scheduled jobs once immediately, independent of their configured schedules',
+ )
+ parser.add_argument(
+ 'import_path',
+ nargs='?',
+ type=str,
+ default=None,
+ help=("Check this path and import any new links on every run "
+ "(can be either local file or remote URL)"),
+ )
+ command = parser.parse_args(args)
+ reject_stdin(__command__)
+
+ os.makedirs(LOGS_DIR, exist_ok=True)
+
+ cron = CronTab(user=True)
+ cron = dedupe_jobs(cron)
+
+ existing_jobs = list(cron.find_comment(CRON_COMMENT))
+ if command.foreground or command.run_all:
+ if command.import_path or (not existing_jobs):
+ stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+ stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
+ raise SystemExit(1)
+ print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+ if command.run_all:
+ try:
+ for job in existing_jobs:
+ sys.stdout.write(f' > {job.command}')
+ sys.stdout.flush()
+ job.run()
+ sys.stdout.write(f'\r √ {job.command}\n')
+ except KeyboardInterrupt:
+ print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+ raise SystemExit(1)
+ if command.foreground:
+ try:
+ for result in cron.run_scheduler():
+ print(result)
+ except KeyboardInterrupt:
+ print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+ raise SystemExit(1)
+
+ elif command.show:
+ if existing_jobs:
+ print('\n'.join(str(cmd) for cmd in existing_jobs))
+ else:
+ stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+ stderr(' To schedule a new job, run:')
+ stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
+ raise SystemExit(0)
+
+ elif command.clear:
+ print(cron.remove_all(comment=CRON_COMMENT))
+ cron.write()
+ raise SystemExit(0)
+
+ elif command.every:
+ quoted = lambda s: f'"{s}"' if s and ' ' in s else s
+ cmd = [
+ 'cd',
+ quoted(OUTPUT_DIR),
+ '&&',
+ quoted(ARCHIVEBOX_BINARY),
+ *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
+ '2>&1',
+ '>',
+ quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
+
+ ]
+ new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
+
+ if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
+ set_every = getattr(new_job.every(), command.every)
+ set_every()
+ elif CronSlices.is_valid(command.every):
+ new_job.setall(command.every)
+ else:
+ stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+ stderr(' It must be one of minute/hour/day/week/month')
+ stderr(' or a quoted cron-format schedule like:')
+        stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
+        stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
+ raise SystemExit(1)
+
+ cron = dedupe_jobs(cron)
+ cron.write()
+
+ total_runs = sum(j.frequency_per_year() for j in cron)
+ existing_jobs = list(cron.find_comment(CRON_COMMENT))
+
+ print()
+ print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+ print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
+ if total_runs > 60 and not command.quiet:
+ stderr()
+ stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+ stderr(f' Congrats on being an enthusiastic internet archiver! 👌')
+ stderr()
+ stderr(' Make sure you have enough storage space available to hold all the data.')
+ stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
+ raise SystemExit(0)
+
+
+def dedupe_jobs(cron: CronTab) -> CronTab:
+ deduped = set()
+ for job in list(cron):
+ unique_tuple = (str(job.slices), job.command)
+ if unique_tuple not in deduped:
+ deduped.add(unique_tuple)
+ cron.remove(job)
+
+ for schedule, command in deduped:
+ job = cron.new(command=command, comment=CRON_COMMENT)
+ job.setall(schedule)
+ job.enable()
+
+ return cron
+
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index 4113ed10f4..1e1140ef26 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -7,7 +7,7 @@
import sys
import argparse
-from ..legacy.config import setup_django
+from ..legacy.config import setup_django, OUTPUT_DIR
from ..legacy.util import reject_stdin
@@ -29,7 +29,7 @@ def main(args=None):
command = parser.parse_args(args)
reject_stdin(__command__)
- setup_django()
+ setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("runserver", *command.runserver_args)
diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py
index 6fc84c4080..3500edf27b 100644
--- a/archivebox/cli/archivebox_shell.py
+++ b/archivebox/cli/archivebox_shell.py
@@ -7,7 +7,7 @@
import sys
import argparse
-from ..legacy.config import setup_django
+from ..legacy.config import setup_django, OUTPUT_DIR
from ..legacy.util import reject_stdin
@@ -22,7 +22,7 @@ def main(args=None):
parser.parse_args(args)
reject_stdin(__command__)
- setup_django()
+ setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("shell_plus")
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index ff1fbe674c..683f6d61f6 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -5,10 +5,8 @@
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
-OUTPUT_DIR = os.path.abspath(os.curdir)
-DATABASE_DIR_NAME = 'database'
-DATABASE_FILE_NAME = 'database.sqlite3'
-DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
+DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
INSTALLED_APPS = [
@@ -38,7 +36,7 @@
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
- 'DIRS': ['templates'],
+ 'DIRS': ['themes'],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
diff --git a/archivebox/env.py b/archivebox/env.py
deleted file mode 100644
index 905fa2755f..0000000000
--- a/archivebox/env.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import os
-import sys
-
-
-PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
-
-sys.path.append(PYTHON_DIR)
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
-
-import django
-django.setup()
-
-from django.conf import settings
-
-DATABASE_FILE = settings.DATABASE_FILE
diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py
index 64c4ce8780..82ec5a73f6 100644
--- a/archivebox/legacy/config.py
+++ b/archivebox/legacy/config.py
@@ -60,7 +60,6 @@
YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
CHROME_BINARY = os.getenv('CHROME_BINARY', None)
-
# ******************************************************************************
### Terminal Configuration
@@ -84,6 +83,7 @@ def stderr(*args):
sys.stderr.write(' '.join(str(a) for a in args) + '\n')
USER = getpass.getuser() or os.getlogin()
+ARCHIVEBOX_BINARY = sys.argv[0]
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))
if OUTPUT_DIR:
@@ -91,14 +91,15 @@ def stderr(*args):
else:
OUTPUT_DIR = os.path.abspath(os.curdir)
+SQL_INDEX_FILENAME = 'index.sqlite3'
+JSON_INDEX_FILENAME = 'index.json'
+HTML_INDEX_FILENAME = 'index.html'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
-DATABASE_DIR_NAME = 'database'
-DATABASE_FILE_NAME = 'database.sqlite3'
+LOGS_DIR_NAME = 'logs'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
-DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME)
-DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+LOGS_DIR = os.path.join(OUTPUT_DIR, LOGS_DIR_NAME)
PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
@@ -126,9 +127,10 @@ def stderr(*args):
raise SystemExit(1)
### Check Python environment
-python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-if python_vers < 3.6:
- stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+PYTHON_BINARY = sys.executable
+PYTHON_VERSION = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
+if float(PYTHON_VERSION) < 3.6:
+ stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], PYTHON_VERSION, ANSI['reset']))
stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
@@ -150,6 +152,7 @@ def stderr(*args):
def bin_version(binary: str) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
+
global HAS_INVALID_DEPENDENCIES
binary = os.path.expanduser(binary)
try:
@@ -223,12 +226,17 @@ def find_chrome_data_dir() -> Optional[str]:
return None
-def setup_django():
+def setup_django(out_dir: str=OUTPUT_DIR, check_db=False):
import django
sys.path.append(PYTHON_DIR)
+ os.environ.setdefault('OUTPUT_DIR', out_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup()
+ if check_db:
+ assert os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)), (
+ f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {out_dir}')
+
# ******************************************************************************
# ************************ Environment & Dependencies **************************
# ******************************************************************************
@@ -338,16 +346,16 @@ def setup_django():
'enabled': True,
'is_valid': os.path.exists(SOURCES_DIR),
},
+ 'LOGS_DIR': {
+ 'path': os.path.abspath(LOGS_DIR),
+ 'enabled': True,
+ 'is_valid': os.path.exists(LOGS_DIR),
+ },
'ARCHIVE_DIR': {
'path': os.path.abspath(ARCHIVE_DIR),
'enabled': True,
'is_valid': os.path.exists(ARCHIVE_DIR),
},
- 'DATABASE_DIR': {
- 'path': os.path.abspath(DATABASE_DIR),
- 'enabled': True,
- 'is_valid': os.path.exists(DATABASE_FILE),
- },
'CHROME_USER_DATA_DIR': {
'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
'enabled': USE_CHROME and CHROME_USER_DATA_DIR,
@@ -361,6 +369,12 @@ def setup_django():
}
DEPENDENCIES = {
+ 'PYTHON_BINARY': {
+ 'path': PYTHON_BINARY,
+ 'version': PYTHON_VERSION,
+ 'enabled': True,
+ 'is_valid': bool(DJANGO_VERSION),
+ },
'DJANGO_BINARY': {
'path': DJANGO_BINARY,
'version': DJANGO_VERSION,
diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py
index 173d6b7cf0..c063b1e2e9 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -1,13 +1,17 @@
+__package__ = 'archivebox.legacy'
+
import os
import json
from typing import List, Tuple, Optional, Iterable
from collections import OrderedDict
+from contextlib import contextmanager
from .schema import Link, ArchiveResult
from .config import (
- DATABASE_DIR,
- DATABASE_FILE_NAME,
+ SQL_INDEX_FILENAME,
+ JSON_INDEX_FILENAME,
+ HTML_INDEX_FILENAME,
OUTPUT_DIR,
TIMEOUT,
URL_BLACKLIST_PTN,
@@ -35,14 +39,13 @@
from .parse import parse_links
from .logs import (
log_indexing_process_started,
+ log_indexing_process_finished,
log_indexing_started,
log_indexing_finished,
log_parsing_started,
log_parsing_finished,
)
-
-
### Link filtering and checking
@enforce_types
@@ -117,7 +120,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
if not links:
- stderr('{red}[X] No links found in index.json{reset}'.format(**ANSI))
+ stderr('{red}[X] No links found in index.{reset}'.format(**ANSI))
stderr(' To add a link to your archive, run:')
stderr(" archivebox add 'https://example.com'")
stderr()
@@ -204,58 +207,63 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
### Main Links Index
+@contextmanager
@enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
- """create index.html file for a given list of links"""
-
- log_indexing_process_started()
-
- log_indexing_started(DATABASE_DIR, DATABASE_FILE_NAME)
+def timed_index_update(out_path: str):
+ log_indexing_started(out_path)
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
- write_sql_main_index(links)
+ yield
finally:
timer.end()
- log_indexing_finished(DATABASE_DIR, DATABASE_FILE_NAME)
- log_indexing_started(out_dir, 'index.json')
- timer = TimedProgress(TIMEOUT * 2, prefix=' ')
- try:
+ assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
+ log_indexing_finished(out_path)
+
+
+@enforce_types
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+ """create index.html file for a given list of links"""
+
+ log_indexing_process_started(len(links))
+
+ with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+ write_sql_main_index(links, out_dir=out_dir)
+
+ with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
write_json_main_index(links, out_dir=out_dir)
- finally:
- timer.end()
- log_indexing_finished(out_dir, 'index.json')
-
- log_indexing_started(out_dir, 'index.html')
- timer = TimedProgress(TIMEOUT * 2, prefix=' ')
- try:
+
+ with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
write_html_main_index(links, out_dir=out_dir, finished=finished)
- finally:
- timer.end()
- log_indexing_finished(out_dir, 'index.html')
+
+ log_indexing_process_finished()
@enforce_types
-def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def load_main_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
- existing_links: List[Link] = []
- if out_dir:
- existing_links = list(parse_json_main_index(out_dir))
- existing_sql_links = list(parse_sql_main_index())
- assert set(l.url for l in existing_links) == set(l['url'] for l in existing_sql_links)
+ all_links: List[Link] = []
+ all_links = list(parse_json_main_index(out_dir))
+ links_from_sql = list(parse_sql_main_index())
+ assert set(l.url for l in all_links) == set(l['url'] for l in links_from_sql)
+
+ return all_links
+
+@enforce_types
+def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
new_links: List[Link] = []
- if import_path:
- # parse and validate the import file
- log_parsing_started(import_path)
- raw_links, parser_name = parse_links(import_path)
- new_links = list(validate_links(raw_links))
+
+ # parse and validate the import file
+ log_parsing_started(import_path)
+ raw_links, parser_name = parse_links(import_path)
+ new_links = list(validate_links(raw_links))
# merge existing links in out_dir and new links
all_links = list(validate_links(existing_links + new_links))
- if import_path and parser_name:
+ if parser_name:
num_parsed = len(raw_links)
num_new_links = len(all_links) - len(existing_links)
log_parsing_finished(num_parsed, num_new_links, parser_name)
@@ -323,9 +331,3 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
return merge_links(existing_link, link)
return link
-
-
-
-
-
-
diff --git a/archivebox/legacy/logs.py b/archivebox/legacy/logs.py
index 0f3eb5dc84..8cb1362972 100644
--- a/archivebox/legacy/logs.py
+++ b/archivebox/legacy/logs.py
@@ -6,7 +6,7 @@
from typing import Optional, List
from .schema import Link, ArchiveResult
-from .config import ANSI, OUTPUT_DIR
+from .config import ANSI, OUTPUT_DIR, IS_TTY
@dataclass
@@ -42,7 +42,7 @@ def pretty_path(path: str) -> str:
def log_parsing_started(source_file: str):
start_ts = datetime.now()
_LAST_RUN_STATS.parse_start_ts = start_ts
- print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
+ print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
source_file.rsplit('/', 1)[-1],
**ANSI,
@@ -56,22 +56,26 @@ def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
### Indexing Stage
-def log_indexing_process_started():
+def log_indexing_process_started(num_links: int):
start_ts = datetime.now()
_LAST_RUN_STATS.index_start_ts = start_ts
print()
- print('{green}[*] [{}] Saving main index files...{reset}'.format(
+ print('{green}[*] [{}] Updating {} links in main index...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+ num_links,
**ANSI,
))
-def log_indexing_started(out_dir: str, out_file: str):
- sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
-
-def log_indexing_finished(out_dir: str, out_file: str):
+def log_indexing_process_finished():
end_ts = datetime.now()
_LAST_RUN_STATS.index_end_ts = end_ts
- print('\r √ {}/{}'.format(out_dir, out_file))
+
+def log_indexing_started(out_path: str):
+ if IS_TTY:
+ sys.stdout.write(f' > {out_path}')
+
+def log_indexing_finished(out_path: str):
+ print(f'\r √ {out_path}')
### Archiving Stage
@@ -108,7 +112,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
print(' To view your archive, open:')
print(' {}/index.html'.format(OUTPUT_DIR))
print(' Continue archiving where you left off by running:')
- print(' archivebox {}'.format(timestamp))
+ print(' archivebox update --resume={}'.format(timestamp))
def log_archiving_finished(num_links: int):
end_ts = datetime.now()
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
index 0dd4ffd668..7296add0a8 100644
--- a/archivebox/legacy/main.py
+++ b/archivebox/legacy/main.py
@@ -9,6 +9,7 @@
from .index import (
links_after_timestamp,
load_main_index,
+ import_new_links,
write_main_index,
)
from .archive_methods import archive_link
@@ -19,8 +20,9 @@
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
- DATABASE_DIR,
- DATABASE_FILE,
+ LOGS_DIR,
+ JSON_INDEX_FILENAME,
+ SQL_INDEX_FILENAME,
check_dependencies,
check_data_folder,
setup_django,
@@ -36,60 +38,85 @@
)
+ALLOWED_IN_OUTPUT_DIR = {
+ '.DS_Store',
+ '.venv',
+ 'venv',
+ 'virtualenv',
+ '.virtualenv',
+ 'sources',
+ 'archive',
+ 'logs',
+ 'static',
+}
+
+
@enforce_types
def init():
os.makedirs(OUTPUT_DIR, exist_ok=True)
- harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv', 'sources', 'archive', 'database', 'logs', 'static'}
- is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files)
- existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
+ is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
+ existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
if is_empty:
- stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
- write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+ print('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
+ print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
else:
if existing_index:
- stderr('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
- stderr(f' {OUTPUT_DIR}')
- stderr(f' > index.html')
- stderr(f' > index.json')
+ print('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
+ print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
+ print(f' {OUTPUT_DIR}')
else:
stderr(
- ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
+ ("{red}[X] This folder appears to have non-ArchiveBox files in it. You must run 'archivebox init' inside a completely empty directory.{reset}"
"\n\n"
" {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
- " just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
+ " just cd into the folder and run 'archivebox update' to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
).format(OUTPUT_DIR, **ANSI)
)
raise SystemExit(1)
os.makedirs(SOURCES_DIR, exist_ok=True)
- stderr(f' > sources/')
+ print(f' > {SOURCES_DIR}')
+
os.makedirs(ARCHIVE_DIR, exist_ok=True)
- stderr(f' > archive/')
- os.makedirs(DATABASE_DIR, exist_ok=True)
+ print(f' > {ARCHIVE_DIR}')
- setup_django()
- from django.core.management import call_command
- from django.contrib.auth.models import User
- stderr(f' > database/')
+ os.makedirs(LOGS_DIR, exist_ok=True)
+ print(f' > {LOGS_DIR}')
- stderr('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
+ print('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
+ setup_django(OUTPUT_DIR, check_db=False)
+ from django.core.management import call_command
+ from django.conf import settings
+ assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
+ print(f' {settings.DATABASE_FILE}')
+
+
call_command("makemigrations", interactive=False)
call_command("migrate", interactive=False)
+
+ assert os.path.exists(settings.DATABASE_FILE)
- if not User.objects.filter(is_superuser=True).exists():
- stderr('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
- call_command("createsuperuser", interactive=True)
+ # from django.contrib.auth.models import User
+ # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+ # print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+ # call_command("createsuperuser", interactive=True)
+
+ if existing_index:
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
+ else:
+ write_main_index([], out_dir=OUTPUT_DIR)
- stderr('\n{green}------------------------------------------------------------{reset}'.format(**ANSI))
- stderr('{green}[√] Done. ArchiveBox collection is set up in current folder.{reset}'.format(**ANSI))
- stderr(' To add new links, you can run:')
- stderr(" archivebox add 'https://example.com'")
- stderr()
- stderr(' For more usage and examples, run:')
- stderr(' archivebox help')
+ print('\n{green}----------------------------------------------------------------{reset}'.format(**ANSI))
+ print('{green}[√] Done. ArchiveBox collection is set up in the current folder.{reset}'.format(**ANSI))
+ print(' To add new links, you can run:')
+ print(" archivebox add 'https://example.com'")
+ print()
+ print(' For more usage and examples, run:')
+ print(' archivebox help')
@@ -102,7 +129,11 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
- all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
+ all_links: List[Link] = []
+ new_links: List[Link] = []
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ if import_path:
+ all_links, new_links = import_new_links(all_links, import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
@@ -127,7 +158,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
@@ -152,7 +183,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
if after is not None and float(link.timestamp) < after:
@@ -198,7 +229,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
timer = TimedProgress(360, prefix=' ')
try:
to_keep = []
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
should_remove = (
(after is not None and float(link.timestamp) < after)
diff --git a/archivebox/legacy/storage/html.py b/archivebox/legacy/storage/html.py
index bc58cb566e..dd2d2b92bb 100644
--- a/archivebox/legacy/storage/html.py
+++ b/archivebox/legacy/storage/html.py
@@ -13,6 +13,7 @@
GIT_SHA,
FOOTER_INFO,
ARCHIVE_DIR_NAME,
+ HTML_INDEX_FILENAME,
)
from ..util import (
enforce_types,
@@ -44,7 +45,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
copy_and_overwrite(join(TEMPLATES_DIR, 'static'), join(out_dir, 'static'))
rendered_html = main_index_template(links, finished=finished)
- atomic_write(rendered_html, join(out_dir, 'index.html'))
+ atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
@enforce_types
@@ -100,7 +101,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
- atomic_write(rendered_html, join(out_dir, 'index.html'))
+ atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
@enforce_types
diff --git a/archivebox/legacy/storage/json.py b/archivebox/legacy/storage/json.py
index 697d318b02..183f397562 100644
--- a/archivebox/legacy/storage/json.py
+++ b/archivebox/legacy/storage/json.py
@@ -1,6 +1,7 @@
__package__ = 'archivebox.legacy.storage'
import os
+import sys
import json
from datetime import datetime
@@ -10,12 +11,33 @@
from ..config import (
VERSION,
OUTPUT_DIR,
+ FOOTER_INFO,
+ GIT_SHA,
+ DEPENDENCIES,
+ JSON_INDEX_FILENAME,
)
from ..util import (
enforce_types,
atomic_write,
)
+MAIN_INDEX_HEADER = {
+ 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+ 'schema': 'archivebox.legacy.storage.json',
+ 'copyright_info': FOOTER_INFO,
+ 'meta': {
+ 'project': 'ArchiveBox',
+ 'cmd': sys.argv,
+ 'version': VERSION,
+ 'git_sha': GIT_SHA,
+ 'website': 'https://ArchiveBox.io',
+ 'docs': 'https://github.com/pirate/ArchiveBox/wiki',
+ 'source': 'https://github.com/pirate/ArchiveBox',
+ 'issues': 'https://github.com/pirate/ArchiveBox/issues',
+ 'dependencies': DEPENDENCIES,
+ },
+}
+
### Main Links Index
@@ -23,7 +45,7 @@
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse a archive index json file and return the list of links"""
- index_path = os.path.join(out_dir, 'index.json')
+ index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
@@ -46,18 +68,13 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
if links and links[0].sources:
assert isinstance(links[0].sources[0], str)
- path = os.path.join(out_dir, 'index.json')
-
- index_json = {
- 'info': 'ArchiveBox Index',
- 'source': 'https://github.com/pirate/ArchiveBox',
- 'docs': 'https://github.com/pirate/ArchiveBox/wiki',
- 'version': VERSION,
+ main_index_json = {
+ **MAIN_INDEX_HEADER,
'num_links': len(links),
'updated': datetime.now(),
'links': links,
}
- atomic_write(index_json, path)
+ atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
### Link Details Index
@@ -67,7 +84,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
out_dir = out_dir or link.link_dir
- path = os.path.join(out_dir, 'index.json')
+ path = os.path.join(out_dir, JSON_INDEX_FILENAME)
atomic_write(link._asdict(extended=True), path)
@@ -75,7 +92,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
"""load the json link index from a given directory"""
- existing_index = os.path.join(out_dir, 'index.json')
+ existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
diff --git a/archivebox/legacy/storage/sql.py b/archivebox/legacy/storage/sql.py
index 90a0c41225..be6bfbe2c2 100644
--- a/archivebox/legacy/storage/sql.py
+++ b/archivebox/legacy/storage/sql.py
@@ -4,14 +4,14 @@
from ..schema import Link
from ..util import enforce_types
-from ..config import setup_django
+from ..config import setup_django, OUTPUT_DIR
### Main Links Index
@enforce_types
-def parse_sql_main_index() -> Iterator[Link]:
- setup_django()
+def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
+ setup_django(out_dir, check_db=True)
from core.models import Page
return (
@@ -20,8 +20,8 @@ def parse_sql_main_index() -> Iterator[Link]:
)
@enforce_types
-def write_sql_main_index(links: List[Link]) -> None:
- setup_django()
+def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+ setup_django(out_dir, check_db=True)
from core.models import Page
for link in links:
diff --git a/archivebox/tests.py b/archivebox/tests.py
index 6afb6c7d6c..108617dafb 100755
--- a/archivebox/tests.py
+++ b/archivebox/tests.py
@@ -27,6 +27,11 @@
from .legacy.main import init
from .legacy.index import load_main_index
+from .legacy.config import (
+ SQL_INDEX_FILENAME,
+ JSON_INDEX_FILENAME,
+ HTML_INDEX_FILENAME,
+)
from .cli import (
archivebox_init,
@@ -55,12 +60,12 @@
htt://example15.badc
'''
+stdout = sys.stdout
+stderr = sys.stderr
+
@contextmanager
def output_hidden(show_failing=True):
- stdout = sys.stdout
- stderr = sys.stderr
-
if not HIDE_CLI_OUTPUT:
yield
return
@@ -100,6 +105,11 @@ def test_basic_init(self):
with output_hidden():
archivebox_init.main([])
+ assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
+ assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
+ assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+ assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
+
def test_conflicting_init(self):
with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
f.write('test')
@@ -108,9 +118,25 @@ def test_conflicting_init(self):
with output_hidden(show_failing=False):
archivebox_init.main([])
assert False, 'Init should have exited with an exception'
+ except SystemExit:
+ pass
+
+ assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
+ assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
+ assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+ try:
+ load_main_index(out_dir=OUTPUT_DIR)
+ assert False, 'load_main_index should raise an exception when no index is present'
except:
pass
+ def test_no_dirty_state(self):
+ with output_hidden():
+ init()
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+ with output_hidden():
+ init()
+
class TestAdd(unittest.TestCase):
def setUp(self):
@@ -125,7 +151,7 @@ def test_add_arg_url(self):
with output_hidden():
archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 30
def test_add_arg_file(self):
@@ -136,7 +162,7 @@ def test_add_arg_file(self):
with output_hidden():
archivebox_add.main([test_file])
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
os.remove(test_file)
@@ -144,7 +170,7 @@ def test_add_stdin_url(self):
with output_hidden():
archivebox_add.main([], stdin=test_urls)
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
@@ -155,29 +181,29 @@ def setUp(self):
init()
archivebox_add.main([], stdin=test_urls)
- def tearDown(self):
- shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+ # def tearDown(self):
+ # shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_remove_exact(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 11
def test_remove_regex(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', 'http(s)?:\/\/(.+\.)?(example\d\.com)'])
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 4
def test_remove_domain(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
- all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 10
def test_remove_none(self):
@@ -190,4 +216,7 @@ def test_remove_none(self):
if __name__ == '__main__':
+ if '--verbose' in sys.argv or '-v' in sys.argv:
+ HIDE_CLI_OUTPUT = False
+
unittest.main()
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d7b43bc14e..0000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-dataclasses
-django
-base32-crockford
-
-setuptools
-ipdb
-mypy
-django-stubs
-flake8
-
-#wpull
-#pywb
-#pyppeteer
-#GitPython
-#youtube-dl
-#archivenow
-#requests
diff --git a/setup.py b/setup.py
index 1c048d8a6d..34adc14b02 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@
'Bug Tracker': 'https://github.com/pirate/ArchiveBox/issues',
'Roadmap': 'https://github.com/pirate/ArchiveBox/wiki/Roadmap',
'Changelog': 'https://github.com/pirate/ArchiveBox/wiki/Changelog',
- 'Donations': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
+ 'Patreon': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
},
packages=setuptools.find_packages(),
python_requires='>=3.6',
@@ -40,6 +40,15 @@
"base32-crockford==0.3.0",
"django==2.2",
"django-extensions==2.1.6",
+ "youtube-dl",
+
+ # Some/all of these will likely be added in the future:
+ # wpull
+ # pywb
+ # pyppeteer
+ # archivenow
+ # requests
+
],
entry_points={
'console_scripts': [
From f489dd96a987be58266c528914154d3a75973d1d Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 13:19:47 -0400
Subject: [PATCH 0039/3688] fix archivebox remove rejecting stdin patterns
---
archivebox/cli/archivebox_remove.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index 26bf826291..a413f8cb33 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -60,7 +60,6 @@ def main(args=None):
help='URLs matching this filter pattern will be removed from the index.'
)
command = parser.parse_args(args)
- reject_stdin(__command__)
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
From 354895aef161801eba4e050a8f7838310b2e5c6d Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 13:20:19 -0400
Subject: [PATCH 0040/3688] django admin to view links now working
---
archivebox/core/admin.py | 10 +++++++++-
archivebox/core/models.py | 28 ++++++++++++++++++++++++++++
archivebox/core/urls.py | 2 +-
archivebox/legacy/schema.py | 3 ++-
4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 8c38f3f3da..b61d93d6f2 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -1,3 +1,11 @@
from django.contrib import admin
-# Register your models here.
+from .models import Page
+
+class PageAdmin(admin.ModelAdmin):
+ list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash')
+
+ def short_url(self, obj):
+ return obj.url[:64]
+
+admin.site.register(Page, PageAdmin)
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 6fdcdae2c0..94258b1a31 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -4,6 +4,8 @@
from django.db import models
+from legacy.schema import Link
+
class Page(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@@ -20,6 +22,13 @@ class Page(models.Model):
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+
+ def __repr__(self) -> str:
+ return f'[{self.timestamp}] {self.url[:64]} ({self.title[:64]})'
+
+ def __str__(self) -> str:
+ return f'[{self.timestamp}] {self.url[:64]} ({self.title[:64]})'
+
@classmethod
def from_json(cls, info: dict):
info = {k: v for k, v in info.items() if k in cls.keys}
@@ -31,3 +40,22 @@ def as_json(self, *args) -> dict:
key: getattr(self, key)
for key in args
}
+
+ def as_link(self) -> Link:
+ return Link.from_json(self.as_json())
+
+ @property
+ def is_archived(self):
+ return self.as_link().is_archived
+
+ @property
+ def num_outputs(self):
+ return self.as_link().num_outputs
+
+ @property
+ def url_hash(self):
+ return self.as_link().url_hash
+
+ @property
+ def base_url(self):
+ return self.as_link().base_url
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index a105c91c94..3a2cb8264a 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -5,7 +5,7 @@
from core.views import MainIndex, LinkDetails
urlpatterns = [
- path('', admin.site.urls),
+ path('admin/', admin.site.urls),
path('archive//', LinkDetails.as_view(), name='LinkDetails'),
path('main/', MainIndex.as_view(), name='Home'),
]
diff --git a/archivebox/legacy/schema.py b/archivebox/legacy/schema.py
index 38f2ec95e5..2c0cf0335f 100644
--- a/archivebox/legacy/schema.py
+++ b/archivebox/legacy/schema.py
@@ -181,8 +181,9 @@ def from_json(cls, json_info):
if key in cls.field_names()
}
info['updated'] = parse_date(info['updated'])
+ info['sources'] = info.get('sources') or []
- json_history = info['history']
+ json_history = info.get('history') or {}
cast_history = {}
for method, method_history in json_history.items():
From 168e578ea4c1ed892501717266e0906cd97ea8bd Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 13:21:08 -0400
Subject: [PATCH 0041/3688] fix bad default in scheduler
---
archivebox/cli/archivebox_schedule.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py
index 44f4c73c4e..652e63b776 100644
--- a/archivebox/cli/archivebox_schedule.py
+++ b/archivebox/cli/archivebox_schedule.py
@@ -48,7 +48,7 @@ def main(args=None):
parser.add_argument(
'--every', # '-e',
type=str,
- default='daily',
+ default='day',
help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
)
group.add_argument(
From 29ced7b5c85fba071aa38109b8396e13df6b1258 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 13:36:27 -0400
Subject: [PATCH 0042/3688] allow running archivebox core commands from
manage.py
---
archivebox/core/management/commands/archivebox.py | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/archivebox/core/management/commands/archivebox.py b/archivebox/core/management/commands/archivebox.py
index c3c236e5dc..a68b5d94a5 100644
--- a/archivebox/core/management/commands/archivebox.py
+++ b/archivebox/core/management/commands/archivebox.py
@@ -1,11 +1,18 @@
+__package__ = 'archivebox'
+
from django.core.management.base import BaseCommand
-from legacy.archive import main
+from .cli import run_subcommand
class Command(BaseCommand):
- help = 'ArchiveBox test.bee'
+ help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'
+
+ def add_arguments(self, parser):
+ parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
+ parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')
+
def handle(self, *args, **kwargs):
- main(*args)
+ run_subcommand(kwargs['subcommand'], args=kwargs['command_args'])
From 50b947f41d72596cdf8d21c8e029a8da235c13f2 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 14:34:12 -0400
Subject: [PATCH 0043/3688] add md5 hashes to dependencies dict
---
archivebox/legacy/config.py | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py
index 82ec5a73f6..b7b519ef47 100644
--- a/archivebox/legacy/config.py
+++ b/archivebox/legacy/config.py
@@ -1,12 +1,14 @@
__package__ = 'archivebox.legacy'
import os
+import io
import re
import sys
import django
import getpass
import shutil
+from hashlib import md5
from typing import Optional
from subprocess import run, PIPE, DEVNULL
@@ -173,6 +175,18 @@ def bin_version(binary: str) -> Optional[str]:
stderr()
return None
+def bin_hash(binary: str) -> Optional[str]:
+ bin_path = binary and shutil.which(os.path.expanduser(binary))
+ if not bin_path:
+ return None
+
+ file_hash = md5()
+ with io.open(bin_path, mode='rb') as f:
+ for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
+ file_hash.update(chunk)
+
+ return f'md5:{file_hash.hexdigest()}'
+
def find_chrome_binary() -> Optional[str]:
"""find any installed chrome binaries in the default locations"""
@@ -372,42 +386,49 @@ def setup_django(out_dir: str=OUTPUT_DIR, check_db=False):
'PYTHON_BINARY': {
'path': PYTHON_BINARY,
'version': PYTHON_VERSION,
+ 'hash': bin_hash(PYTHON_BINARY),
'enabled': True,
'is_valid': bool(DJANGO_VERSION),
},
'DJANGO_BINARY': {
'path': DJANGO_BINARY,
'version': DJANGO_VERSION,
+ 'hash': bin_hash(DJANGO_BINARY),
'enabled': True,
'is_valid': bool(DJANGO_VERSION),
},
'CURL_BINARY': {
'path': CURL_BINARY and shutil.which(CURL_BINARY),
'version': CURL_VERSION,
+ 'hash': bin_hash(CURL_BINARY),
'enabled': USE_CURL,
'is_valid': bool(CURL_VERSION),
},
'WGET_BINARY': {
'path': WGET_BINARY and shutil.which(WGET_BINARY),
'version': WGET_VERSION,
+ 'hash': bin_hash(WGET_BINARY),
'enabled': USE_WGET,
'is_valid': bool(WGET_VERSION),
},
'GIT_BINARY': {
'path': GIT_BINARY and shutil.which(GIT_BINARY),
'version': GIT_VERSION,
+ 'hash': bin_hash(GIT_BINARY),
'enabled': FETCH_GIT,
'is_valid': bool(GIT_VERSION),
},
'YOUTUBEDL_BINARY': {
'path': YOUTUBEDL_BINARY and shutil.which(YOUTUBEDL_BINARY),
'version': YOUTUBEDL_VERSION,
+ 'hash': bin_hash(YOUTUBEDL_BINARY),
'enabled': FETCH_MEDIA,
'is_valid': bool(YOUTUBEDL_VERSION),
},
'CHROME_BINARY': {
'path': CHROME_BINARY and shutil.which(CHROME_BINARY),
'version': CHROME_VERSION,
+ 'hash': bin_hash(CHROME_BINARY),
'enabled': USE_CHROME,
'is_valid': bool(CHROME_VERSION),
},
From ab6881933286a38f28043fe284d0cc53be0773ab Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 14:34:30 -0400
Subject: [PATCH 0044/3688] add archivebox info command to scan data dir
---
archivebox/cli/archivebox_info.py | 28 ++++++++++++++++++
archivebox/legacy/main.py | 48 ++++++++++++++++++++++++++++++-
archivebox/legacy/storage/json.py | 2 +-
archivebox/legacy/util.py | 30 ++++++++++++++++++-
4 files changed, 105 insertions(+), 3 deletions(-)
create mode 100644 archivebox/cli/archivebox_info.py
diff --git a/archivebox/cli/archivebox_info.py b/archivebox/cli/archivebox_info.py
new file mode 100644
index 0000000000..38d7eb4895
--- /dev/null
+++ b/archivebox/cli/archivebox_info.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox info'
+__description__ = 'Print out some info and statistics about the archive collection'
+
+import sys
+import argparse
+
+from ..legacy.main import info
+from ..legacy.util import reject_stdin
+
+
+def main(args=None):
+ args = sys.argv[1:] if args is None else args
+
+ parser = argparse.ArgumentParser(
+ prog=__command__,
+ description=__description__,
+ add_help=True,
+ )
+ parser.parse_args(args)
+ reject_stdin(__command__)
+
+ info()
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
index 7296add0a8..49e4903bd7 100644
--- a/archivebox/legacy/main.py
+++ b/archivebox/legacy/main.py
@@ -5,7 +5,12 @@
from typing import List, Optional, Iterable
from .schema import Link
-from .util import enforce_types, TimedProgress
+from .util import (
+ enforce_types,
+ TimedProgress,
+ get_dir_size,
+ human_readable_size,
+)
from .index import (
links_after_timestamp,
load_main_index,
@@ -119,6 +124,47 @@ def init():
print(' archivebox help')
+@enforce_types
+def info():
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+
+ print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), **ANSI))
+ print(f' {OUTPUT_DIR}')
+
+ num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
+ size = human_readable_size(num_bytes)
+ print(f' > Index Size: {size} across {num_files} files in')
+ print()
+
+ print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
+ print(f' {ARCHIVE_DIR}')
+
+ num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+ size = human_readable_size(num_bytes)
+ print(f' > Total Size: {size} across {num_files} files in {num_dirs} directories')
+ print()
+
+ link_data_dirs = {link.link_dir for link in all_links}
+ valid_archive_dirs = set()
+ num_invalid = 0
+ for entry in os.scandir(ARCHIVE_DIR):
+ if entry.is_dir(follow_symlinks=True):
+ if os.path.exists(os.path.join(entry.path, 'index.json')):
+ valid_archive_dirs.add(entry.path)
+ else:
+ num_invalid += 1
+
+ print(f' > {len(valid_archive_dirs)} valid archive data directories (valid directories matched to links in the index)')
+
+ num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs)
+ print(f' > {num_unarchived} missing data directories (directories missing for links in the index)')
+
+ print(f' > {num_invalid} invalid data directories (directories present that don\'t contain an index file)')
+
+ num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs)
+ print(f' > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)')
+
+
@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
diff --git a/archivebox/legacy/storage/json.py b/archivebox/legacy/storage/json.py
index 183f397562..a602762829 100644
--- a/archivebox/legacy/storage/json.py
+++ b/archivebox/legacy/storage/json.py
@@ -27,7 +27,6 @@
'copyright_info': FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
- 'cmd': sys.argv,
'version': VERSION,
'git_sha': GIT_SHA,
'website': 'https://ArchiveBox.io',
@@ -72,6 +71,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
**MAIN_INDEX_HEADER,
'num_links': len(links),
'updated': datetime.now(),
+ 'last_run_cmd': sys.argv,
'links': links,
}
atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
diff --git a/archivebox/legacy/util.py b/archivebox/legacy/util.py
index c4f1432855..e30782fa1a 100644
--- a/archivebox/legacy/util.py
+++ b/archivebox/legacy/util.py
@@ -7,7 +7,7 @@
from string import Template
from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping
+from typing import List, Optional, Any, Union, IO, Mapping, Tuple
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -561,6 +561,34 @@ def copy_and_overwrite(from_path: str, to_path: str):
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
+
+@enforce_types
+def get_dir_size(path: str, recursive: bool=True) -> Tuple[int, int, int]:
+ num_bytes, num_dirs, num_files = 0, 0, 0
+ for entry in os.scandir(path):
+ if entry.is_dir(follow_symlinks=False):
+ if not recursive:
+ continue
+ num_dirs += 1
+ bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
+ num_bytes += bytes_inside
+ num_dirs += dirs_inside
+ num_files += files_inside
+ else:
+ num_bytes += entry.stat(follow_symlinks=False).st_size
+ num_files += 1
+ return num_bytes, num_dirs, num_files
+
+
+@enforce_types
+def human_readable_size(num_bytes: Union[int, float]) -> str:
+ for count in ['Bytes','KB','MB','GB']:
+ if num_bytes > -1024.0 and num_bytes < 1024.0:
+ return '%3.1f%s' % (num_bytes, count)
+ num_bytes /= 1024.0
+ return '%3.1f%s' % (num_bytes, 'TB')
+
+
@enforce_types
def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments"""
From 2f0dbeebc1988e4238639221ae8ae6b91043e3bf Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 14:42:04 -0400
Subject: [PATCH 0045/3688] update docstrings and comments
---
archivebox/cli/__init__.py | 2 +-
archivebox/cli/archivebox_help.py | 5 +++--
archivebox/cli/archivebox_schedule.py | 2 +-
archivebox/legacy/main.py | 2 +-
4 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index ae78531bba..082acf3888 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -8,7 +8,7 @@
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
# these common commands will appear sorted before any others for ease-of-use
-display_first = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
+display_first = ('help', 'version', 'init', 'info', 'list', 'update', 'add', 'remove')
# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')
diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py
index 1ef4922332..b049ef70e6 100755
--- a/archivebox/cli/archivebox_help.py
+++ b/archivebox/cli/archivebox_help.py
@@ -40,13 +40,14 @@ def main(args=None):
{lightblue}Example Use:{reset}
mkdir my-archive; cd my-archive/
archivebox init
+ archivebox info
archivebox add https://example.com/some/page
archivebox add --depth=1 ~/Downloads/bookmarks_export.html
- archivebox subscribe https://example.com/some/feed.rss
- archivebox update --resume=15109948213.123
archivebox list --sort=timestamp --csv=timestamp,url,is_archived
+ archivebox schedule --every=week https://example.com/some/feed.rss
+ archivebox update --resume=15109948213.123
{lightblue}Documentation:{reset}
https://github.com/pirate/ArchiveBox/wiki
diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py
index 652e63b776..09c5a92061 100644
--- a/archivebox/cli/archivebox_schedule.py
+++ b/archivebox/cli/archivebox_schedule.py
@@ -2,7 +2,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
-__description__ = 'Set ArchiveBox to run regularly at a specific time'
+__description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
import os
import sys
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
index 49e4903bd7..3ecdc887bc 100644
--- a/archivebox/legacy/main.py
+++ b/archivebox/legacy/main.py
@@ -133,7 +133,7 @@ def info():
num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
size = human_readable_size(num_bytes)
- print(f' > Index Size: {size} across {num_files} files in')
+ print(f' > Index Size: {size} across {num_files} files')
print()
print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
From bb10171f99e22583534580fcdc03942f252e6072 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 19:06:12 -0400
Subject: [PATCH 0046/3688] add missing dependencies to setup and pipfile
---
Pipfile | 3 +-
Pipfile.lock | 114 ++++++++++++++++++++++++++++++++++++++++++++++++---
setup.py | 2 +
3 files changed, 112 insertions(+), 7 deletions(-)
diff --git a/Pipfile b/Pipfile
index 4ba4d08e4b..d511dfb88d 100644
--- a/Pipfile
+++ b/Pipfile
@@ -17,6 +17,7 @@ django = "*"
youtube-dl = "*"
python-crontab = "*"
croniter = "*"
+ipython = "*"
[requires]
-python_version = ">=3.6"
+python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
index 9b05ded293..331c202278 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,11 +1,11 @@
{
"_meta": {
"hash": {
- "sha256": "7f25fb9c97e469fdb787e755c5756e2be4b0b649e3c5ad8feb17200b32d3bb36"
+ "sha256": "a28212eba2c7ffc28d5af0cac4a754440b72b5b088ef7825c255cdfa33e5a047"
},
"pipfile-spec": 6,
"requires": {
- "python_version": ">=3.6"
+ "python_version": "3.7"
},
"sources": [
{
@@ -16,6 +16,21 @@
]
},
"default": {
+ "appnope": {
+ "hashes": [
+ "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
+ "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
+ ],
+ "markers": "sys_platform == 'darwin'",
+ "version": "==0.1.0"
+ },
+ "backcall": {
+ "hashes": [
+ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+ "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+ ],
+ "version": "==0.1.0"
+ },
"base32-crockford": {
"hashes": [
"sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969",
@@ -26,11 +41,11 @@
},
"croniter": {
"hashes": [
- "sha256:625949cbd38a0b2325295591940dfa5fa0dfca41d03150ae0284a924e0be10f0",
- "sha256:66b6a9c6b2d1a85d4af51453b2328be775a173e688b69eb3a96a7ec752ba77a3"
+ "sha256:0d905dbe6f131a910fd3dde792f0129788cd2cb3a8048c5f7aaa212670b0cef2",
+ "sha256:538adeb3a7f7816c3cdec6db974c441620d764c25ff4ed0146ee7296b8a50590"
],
"index": "pypi",
- "version": "==0.3.29"
+ "version": "==0.3.30"
},
"dataclasses": {
"hashes": [
@@ -40,6 +55,13 @@
"index": "pypi",
"version": "==0.6"
},
+ "decorator": {
+ "hashes": [
+ "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
+ "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+ ],
+ "version": "==4.4.0"
+ },
"django": {
"hashes": [
"sha256:7c3543e4fb070d14e10926189a7fcf42ba919263b7473dceaefce34d54e8a119",
@@ -48,6 +70,72 @@
"index": "pypi",
"version": "==2.2"
},
+ "ipython": {
+ "hashes": [
+ "sha256:b038baa489c38f6d853a3cfc4c635b0cda66f2864d136fe8f40c1a6e334e2a6b",
+ "sha256:f5102c1cd67e399ec8ea66bcebe6e3968ea25a8977e53f012963e5affeb1fe38"
+ ],
+ "index": "pypi",
+ "version": "==7.4.0"
+ },
+ "ipython-genutils": {
+ "hashes": [
+ "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+ "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+ ],
+ "version": "==0.2.0"
+ },
+ "jedi": {
+ "hashes": [
+ "sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b",
+ "sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c"
+ ],
+ "version": "==0.13.3"
+ },
+ "parso": {
+ "hashes": [
+ "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33",
+ "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376"
+ ],
+ "version": "==0.4.0"
+ },
+ "pexpect": {
+ "hashes": [
+ "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
+ "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ ],
+ "markers": "sys_platform != 'win32'",
+ "version": "==4.7.0"
+ },
+ "pickleshare": {
+ "hashes": [
+ "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+ "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+ ],
+ "version": "==0.7.5"
+ },
+ "prompt-toolkit": {
+ "hashes": [
+ "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+ "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+ "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+ ],
+ "version": "==2.0.9"
+ },
+ "ptyprocess": {
+ "hashes": [
+ "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+ "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ ],
+ "version": "==0.6.0"
+ },
+ "pygments": {
+ "hashes": [
+ "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
+ "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
+ ],
+ "version": "==2.3.1"
+ },
"python-crontab": {
"hashes": [
"sha256:91ce4b245ee5e5c117aa0b21b485bc43f2d80df854a36e922b707643f50d7923"
@@ -83,6 +171,20 @@
],
"version": "==0.3.0"
},
+ "traitlets": {
+ "hashes": [
+ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+ "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+ ],
+ "version": "==4.3.2"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ },
"youtube-dl": {
"hashes": [
"sha256:0d25459093870bf560bccafe9015e59402d7de1b2c956593623ba4c2840153e5",
@@ -150,7 +252,7 @@
"sha256:b038baa489c38f6d853a3cfc4c635b0cda66f2864d136fe8f40c1a6e334e2a6b",
"sha256:f5102c1cd67e399ec8ea66bcebe6e3968ea25a8977e53f012963e5affeb1fe38"
],
- "markers": "python_version >= '3.4'",
+ "index": "pypi",
"version": "==7.4.0"
},
"ipython-genutils": {
diff --git a/setup.py b/setup.py
index 34adc14b02..b4db4f5428 100644
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,9 @@
"base32-crockford==0.3.0",
"django==2.2",
"django-extensions==2.1.6",
+ "python-crontab",
"youtube-dl",
+ "ipython",
# Some/all of these will likely be added in the future:
# wpull
From f0f516e853e38886c58aadda852c11376d4bb44a Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 19:06:48 -0400
Subject: [PATCH 0047/3688] check for data folder when running most subcommands
---
archivebox/cli/archivebox.py | 9 ++++++++-
archivebox/cli/archivebox_add.py | 3 ++-
archivebox/cli/archivebox_info.py | 5 ++++-
archivebox/cli/archivebox_list.py | 3 +++
archivebox/cli/archivebox_remove.py | 5 ++++-
archivebox/cli/archivebox_schedule.py | 3 +++
archivebox/cli/archivebox_server.py | 15 ++++++++++++++-
archivebox/cli/archivebox_shell.py | 4 +++-
archivebox/cli/archivebox_update.py | 3 +++
9 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/archivebox/cli/archivebox.py b/archivebox/cli/archivebox.py
index 803bd9a989..d1326721a2 100755
--- a/archivebox/cli/archivebox.py
+++ b/archivebox/cli/archivebox.py
@@ -5,10 +5,12 @@
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'
+import os
import sys
import argparse
from . import list_subcommands, run_subcommand
+from ..legacy.config import OUTPUT_DIR
def parse_args(args=None):
@@ -78,8 +80,13 @@ def print_import_tutorial():
def main(args=None):
subcommand, subcommand_args = parse_args(args)
+ existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
+
if subcommand is None:
- print_import_tutorial()
+ if existing_index:
+ run_subcommand('help', subcommand_args)
+ else:
+ print_import_tutorial()
raise SystemExit(0)
run_subcommand(subcommand, subcommand_args)
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 33f5e9234e..241c3f88ca 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -16,6 +16,8 @@
def main(args=None, stdin=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
@@ -55,7 +57,6 @@ def main(args=None, stdin=None):
command = parser.parse_args(args)
check_dependencies()
- check_data_folder()
### Handle ingesting urls piped in through stdin
# (.e.g if user does cat example_urls.txt | archivebox add)
diff --git a/archivebox/cli/archivebox_info.py b/archivebox/cli/archivebox_info.py
index 38d7eb4895..bf04d89e80 100644
--- a/archivebox/cli/archivebox_info.py
+++ b/archivebox/cli/archivebox_info.py
@@ -7,11 +7,14 @@
import sys
import argparse
-from ..legacy.main import info
+from ..legacy.config import check_data_folder
from ..legacy.util import reject_stdin
+from ..legacy.main import info
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index d421f8de90..dd4b62f893 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -9,10 +9,13 @@
from ..legacy.util import reject_stdin, to_json, to_csv
+from ..legacy.config import check_data_folder
from ..legacy.main import list_archive_data
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index a413f8cb33..4ddba35473 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -8,11 +8,14 @@
import argparse
-from ..legacy.main import remove_archive_links
+from ..legacy.config import check_data_folder
from ..legacy.util import reject_stdin
+from ..legacy.main import remove_archive_links
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py
index 09c5a92061..f6e685f84b 100644
--- a/archivebox/cli/archivebox_schedule.py
+++ b/archivebox/cli/archivebox_schedule.py
@@ -20,6 +20,7 @@
USER,
ANSI,
stderr,
+ check_data_folder,
)
@@ -27,6 +28,8 @@
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index 1e1140ef26..2955812a50 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -7,11 +7,13 @@
import sys
import argparse
-from ..legacy.config import setup_django, OUTPUT_DIR
+from ..legacy.config import setup_django, OUTPUT_DIR, ANSI, check_data_folder
from ..legacy.util import reject_stdin
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
@@ -26,11 +28,22 @@ def main(args=None):
default=None,
help='Arguments to pass to Django runserver'
)
+ parser.add_argument(
+ '--reload',
+ action='store_true',
+ help='Enable auto-reloading when code or templates change',
+ )
command = parser.parse_args(args)
reject_stdin(__command__)
setup_django(OUTPUT_DIR)
from django.core.management import call_command
+
+
+ print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
+ if not command.reload:
+ command.runserver_args.append('--noreload')
+
call_command("runserver", *command.runserver_args)
diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py
index 3500edf27b..dd509e3faa 100644
--- a/archivebox/cli/archivebox_shell.py
+++ b/archivebox/cli/archivebox_shell.py
@@ -7,11 +7,13 @@
import sys
import argparse
-from ..legacy.config import setup_django, OUTPUT_DIR
+from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder
from ..legacy.util import reject_stdin
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index c74fc8b71d..e80fdce54b 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -8,11 +8,14 @@
import argparse
+from ..legacy.config import check_data_folder
from ..legacy.util import reject_stdin
from ..legacy.main import update_archive_data
def main(args=None):
+ check_data_folder()
+
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
From 834aaa159101082dc36227541f5e6005732bf2e3 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 22 Apr 2019 19:07:39 -0400
Subject: [PATCH 0048/3688] better template staticfile management with themes
dir
---
archivebox/core/settings.py | 55 +++-
archivebox/core/urls.py | 23 +-
archivebox/themes/admin/login.html | 100 +++++++
archivebox/themes/default/add_links.html | 209 +++++++++++++++
archivebox/themes/default/main_index.html | 243 ++++++++++++++++++
archivebox/themes/static/archive.png | Bin 0 -> 17730 bytes
archivebox/themes/static/bootstrap.min.css | 6 +
archivebox/themes/static/external.png | Bin 0 -> 1647 bytes
.../themes/static/jquery.dataTables.min.css | 1 +
.../themes/static/jquery.dataTables.min.js | 166 ++++++++++++
archivebox/themes/static/jquery.min.js | 2 +
archivebox/themes/static/sort_asc.png | Bin 0 -> 158 bytes
archivebox/themes/static/sort_both.png | Bin 0 -> 201 bytes
archivebox/themes/static/sort_desc.png | Bin 0 -> 157 bytes
archivebox/themes/static/spinner.gif | Bin 0 -> 10949 bytes
15 files changed, 798 insertions(+), 7 deletions(-)
create mode 100644 archivebox/themes/admin/login.html
create mode 100644 archivebox/themes/default/add_links.html
create mode 100644 archivebox/themes/default/main_index.html
create mode 100644 archivebox/themes/static/archive.png
create mode 100644 archivebox/themes/static/bootstrap.min.css
create mode 100755 archivebox/themes/static/external.png
create mode 100644 archivebox/themes/static/jquery.dataTables.min.css
create mode 100644 archivebox/themes/static/jquery.dataTables.min.js
create mode 100644 archivebox/themes/static/jquery.min.js
create mode 100755 archivebox/themes/static/sort_asc.png
create mode 100755 archivebox/themes/static/sort_both.png
create mode 100755 archivebox/themes/static/sort_desc.png
create mode 100644 archivebox/themes/static/spinner.gif
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 683f6d61f6..ce5300aafa 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -1,20 +1,27 @@
__package__ = 'archivebox.core'
import os
+import sys
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
+ALLOWED_HOSTS = ['*']
+REPO_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), os.path.pardir, os.path.pardir))
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
+ACTIVE_THEME = 'default'
+
+IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
INSTALLED_APPS = [
- 'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
+ # 'django.contrib.sites',
'django.contrib.messages',
+ 'django.contrib.admin',
'django.contrib.staticfiles',
'core',
@@ -22,6 +29,7 @@
'django_extensions',
]
+
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -29,14 +37,18 @@
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
- 'django.middleware.clickjacking.XFrameOptionsMiddleware',
+ # 'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'core.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
- 'DIRS': ['themes'],
+ 'DIRS': [
+ os.path.join(REPO_DIR, 'themes', ACTIVE_THEME),
+ os.path.join(REPO_DIR, 'themes', 'default'),
+ os.path.join(REPO_DIR, 'themes'),
+ ],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
@@ -58,6 +70,9 @@
}
}
+AUTHENTICATION_BACKENDS = [
+ 'django.contrib.auth.backends.ModelBackend',
+]
AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
{'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
@@ -65,6 +80,29 @@
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
+################################################################################
+### Security Settings
+################################################################################
+SECURE_BROWSER_XSS_FILTER = True
+SECURE_CONTENT_TYPE_NOSNIFF = True
+SESSION_COOKIE_SECURE = False
+CSRF_COOKIE_SECURE = False
+SESSION_COOKIE_DOMAIN = None
+SESSION_EXPIRE_AT_BROWSER_CLOSE = False
+SESSION_SAVE_EVERY_REQUEST = True
+SESSION_COOKIE_AGE = 1209600 # 2 weeks
+LOGIN_URL = '/accounts/login/'
+LOGOUT_REDIRECT_URL = '/'
+PASSWORD_RESET_URL = '/accounts/password_reset/'
+
+
+SHELL_PLUS = 'ipython'
+SHELL_PLUS_PRINT_SQL = False
+IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
+IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
+if IS_SHELL:
+ os.environ['PYTHONSTARTUP'] = os.path.join(REPO_DIR, 'core', 'welcome_message.py')
+
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
@@ -73,4 +111,15 @@
USE_TZ = False
+EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
+
STATIC_URL = '/static/'
+STATICFILES_DIRS = [
+ os.path.join(REPO_DIR, 'themes', ACTIVE_THEME, 'static'),
+ os.path.join(REPO_DIR, 'themes', 'default', 'static'),
+ os.path.join(REPO_DIR, 'themes', 'static'),
+]
+
+SERVE_STATIC = True
+
+
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index 3a2cb8264a..e29b2971c7 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -1,11 +1,26 @@
from django.contrib import admin
-from django.urls import path
+from django.utils.translation import ugettext_lazy
+from django.urls import path, include
+from django.conf import settings
-from core.views import MainIndex, LinkDetails
+from core.views import MainIndex, AddLinks, LinkDetails
+
+admin.site.site_header = 'ArchiveBox Admin'
+admin.site.index_title = 'Archive Administration'
urlpatterns = [
- path('admin/', admin.site.urls),
path('archive//', LinkDetails.as_view(), name='LinkDetails'),
- path('main/', MainIndex.as_view(), name='Home'),
+ path('accounts/', include('django.contrib.auth.urls')),
+ path('admin/', admin.site.urls),
+ path('add/', AddLinks.as_view(), name='AddLinks'),
+ path('', MainIndex.as_view(), name='Home'),
]
+
+
+if settings.SERVE_STATIC:
+ # serve staticfiles via runserver
+ from django.contrib.staticfiles import views
+ urlpatterns += [
+ path('static/', views.serve),
+ ]
diff --git a/archivebox/themes/admin/login.html b/archivebox/themes/admin/login.html
new file mode 100644
index 0000000000..a6d8eac730
--- /dev/null
+++ b/archivebox/themes/admin/login.html
@@ -0,0 +1,100 @@
+{% extends "admin/base_site.html" %}
+{% load i18n static %}
+
+{% block extrastyle %}{{ block.super }}
+{{ form.media }}
+{% endblock %}
+
+{% block bodyclass %}{{ block.super }} login{% endblock %}
+
+{% block branding %}
ArchiveBox Admin
{% endblock %}
+
+{% block usertools %}
+
+ Back to Main Index
+{% endblock %}
+
+{% block nav-global %}{% endblock %}
+
+{% block content_title %}
+
+ Log in to add, edit, and remove links from your archive.
+
+
+{% endblock %}
+
+{% block breadcrumbs %}{% endblock %}
+
+{% block content %}
+{% if form.errors and not form.non_field_errors %}
+
+{% if form.errors.items|length == 1 %}{% trans "Please correct the error below." %}{% else %}{% trans "Please correct the errors below." %}{% endif %}
+
+{% endif %}
+
+{% if form.non_field_errors %}
+{% for error in form.non_field_errors %}
+
+ {{ error }}
+
+{% endfor %}
+{% endif %}
+
+
+
+{% if user.is_authenticated %}
+
+{% blocktrans trimmed %}
+ You are authenticated as {{ username }}, but are not authorized to
+ access this page. Would you like to login to a different account?
+{% endblocktrans %}
+
+{% endif %}
+
+
+
+
+
+
+
+
+ If you forgot your password, reset it here or run:
+
+archivebox manage changepassword USERNAME
+
+
+
+
+
+ To create a new admin user, run the following:
+
+archivebox manage createsuperuser
+
+
+
+
+ (cd into your archive folder before running commands)
+