diff --git a/aurweb/archives/__init__.py b/aurweb/archives/__init__.py new file mode 100644 index 00000000..47020641 --- /dev/null +++ b/aurweb/archives/__init__.py @@ -0,0 +1 @@ +# aurweb.archives diff --git a/aurweb/archives/spec/__init__.py b/aurweb/archives/spec/__init__.py new file mode 100644 index 00000000..b6e376b4 --- /dev/null +++ b/aurweb/archives/spec/__init__.py @@ -0,0 +1 @@ +# aurweb.archives.spec diff --git a/aurweb/archives/spec/base.py b/aurweb/archives/spec/base.py new file mode 100644 index 00000000..60f734f2 --- /dev/null +++ b/aurweb/archives/spec/base.py @@ -0,0 +1,77 @@ +from pathlib import Path +from typing import Any, Dict, Iterable, List, Set + + +class GitInfo: + """Information about a Git repository.""" + + """ Path to Git repository. """ + path: str + + """ Local Git repository configuration. """ + config: Dict[str, Any] + + def __init__(self, path: str, config: Dict[str, Any] = dict()) -> "GitInfo": + self.path = Path(path) + self.config = config + + +class SpecOutput: + """Class used for git_archive.py output details.""" + + """ Filename relative to the Git repository root. """ + filename: Path + + """ Git repository information. """ + git_info: GitInfo + + """ Bytes bound for `SpecOutput.filename`. """ + data: bytes + + def __init__(self, filename: str, git_info: GitInfo, data: bytes) -> "SpecOutput": + self.filename = filename + self.git_info = git_info + self.data = data + + +class SpecBase: + """ + Base for Spec classes defined in git_archive.py --spec modules. + + All supported --spec modules must contain the following classes: + - Spec(SpecBase) + """ + + """ A list of SpecOutputs, each of which contains output file data. """ + outputs: List[SpecOutput] = list() + + """ A set of repositories to commit changes to. """ + repos: Set[str] = set() + + def generate(self) -> Iterable[SpecOutput]: + """ + "Pure virtual" output generator.
+ + `SpecBase.outputs` and `SpecBase.repos` should be populated within an + overridden version of this function in SpecBase derivatives. + """ + raise NotImplementedError() + + def add_output(self, filename: str, git_info: GitInfo, data: bytes) -> None: + """ + Add a SpecOutput instance to the set of outputs. + + :param filename: Filename relative to the git repository root + :param git_info: GitInfo instance + :param data: Binary data bound for `filename` + """ + if git_info.path not in self.repos: + self.repos.add(git_info.path) + + self.outputs.append( + SpecOutput( + filename, + git_info, + data, + ) + ) diff --git a/aurweb/archives/spec/metadata.py b/aurweb/archives/spec/metadata.py new file mode 100644 index 00000000..e7c8e096 --- /dev/null +++ b/aurweb/archives/spec/metadata.py @@ -0,0 +1,85 @@ +from typing import Iterable + +import orjson + +from aurweb import config, db +from aurweb.models import Package, PackageBase, User +from aurweb.rpc import RPC + +from .base import GitInfo, SpecBase, SpecOutput + +ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2 + + +class Spec(SpecBase): + def __init__(self) -> "Spec": + self.metadata_repo = GitInfo( + config.get("git-archive", "metadata-repo"), + ) + + def generate(self) -> Iterable[SpecOutput]: + # Base query used by the RPC. + base_query = ( + db.query(Package) + .join(PackageBase) + .join(User, PackageBase.MaintainerUID == User.ID) + ) + + # Create an instance of RPC, use it to get entities from + # our query and perform a metadata subquery for all packages. 
+ rpc = RPC(version=5, type="info") + print("performing package database query") + packages = rpc.entities(base_query).all() + print("performing package database subqueries") + rpc.subquery({pkg.ID for pkg in packages}) + + pkgbases, pkgnames = dict(), dict() + for package in packages: + # Produce RPC type=info data for `package` + data = rpc.get_info_json_data(package) + + pkgbase_name = data.get("PackageBase") + pkgbase_data = { + "ID": data.pop("PackageBaseID"), + "URLPath": data.pop("URLPath"), + "FirstSubmitted": data.pop("FirstSubmitted"), + "LastModified": data.pop("LastModified"), + "OutOfDate": data.pop("OutOfDate"), + "Maintainer": data.pop("Maintainer"), + "Keywords": data.pop("Keywords"), + "NumVotes": data.pop("NumVotes"), + "Popularity": data.pop("Popularity"), + "PopularityUpdated": package.PopularityUpdated.timestamp(), + } + + # Store the data in `pkgbases` dict. We do this so we only + # end up processing a single `pkgbase` if repeated after + # this loop + pkgbases[pkgbase_name] = pkgbase_data + + # Remove Popularity and NumVotes from package data. + # These fields change quite often which causes git data + # modification to explode. + # data.pop("NumVotes") + # data.pop("Popularity") + + # Remove the ID key from package json. 
+ data.pop("ID") + + # Add the `package`.Name to the pkgnames set + name = data.get("Name") + pkgnames[name] = data + + # Add metadata outputs + self.add_output( + "pkgname.json", + self.metadata_repo, + orjson.dumps(pkgnames, option=ORJSON_OPTS), + ) + self.add_output( + "pkgbase.json", + self.metadata_repo, + orjson.dumps(pkgbases, option=ORJSON_OPTS), + ) + + return self.outputs diff --git a/aurweb/archives/spec/pkgbases.py b/aurweb/archives/spec/pkgbases.py new file mode 100644 index 00000000..9f02c1c6 --- /dev/null +++ b/aurweb/archives/spec/pkgbases.py @@ -0,0 +1,32 @@ +from typing import Iterable + +import orjson + +from aurweb import config, db +from aurweb.models import PackageBase + +from .base import GitInfo, SpecBase, SpecOutput + +ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2 + + +class Spec(SpecBase): + def __init__(self) -> "Spec": + self.pkgbases_repo = GitInfo(config.get("git-archive", "pkgbases-repo")) + + def generate(self) -> Iterable[SpecOutput]: + filt = PackageBase.PackagerUID.isnot(None) + query = ( + db.query(PackageBase.Name) + .filter(filt) + .order_by(PackageBase.Name.asc()) + .all() + ) + pkgbases = [pkgbase.Name for pkgbase in query] + + self.add_output( + "pkgbase.json", + self.pkgbases_repo, + orjson.dumps(pkgbases, option=ORJSON_OPTS), + ) + return self.outputs diff --git a/aurweb/archives/spec/pkgnames.py b/aurweb/archives/spec/pkgnames.py new file mode 100644 index 00000000..c7cd9ea7 --- /dev/null +++ b/aurweb/archives/spec/pkgnames.py @@ -0,0 +1,33 @@ +from typing import Iterable + +import orjson + +from aurweb import config, db +from aurweb.models import Package, PackageBase + +from .base import GitInfo, SpecBase, SpecOutput + +ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2 + + +class Spec(SpecBase): + def __init__(self) -> "Spec": + self.pkgnames_repo = GitInfo(config.get("git-archive", "pkgnames-repo")) + + def generate(self) -> Iterable[SpecOutput]: + filt = PackageBase.PackagerUID.isnot(None) + query = ( 
+ db.query(Package.Name) + .join(PackageBase, PackageBase.ID == Package.PackageBaseID) + .filter(filt) + .order_by(Package.Name.asc()) + .all() + ) + pkgnames = [pkg.Name for pkg in query] + + self.add_output( + "pkgname.json", + self.pkgnames_repo, + orjson.dumps(pkgnames, option=ORJSON_OPTS), + ) + return self.outputs diff --git a/aurweb/archives/spec/users.py b/aurweb/archives/spec/users.py new file mode 100644 index 00000000..80da1641 --- /dev/null +++ b/aurweb/archives/spec/users.py @@ -0,0 +1,26 @@ +from typing import Iterable + +import orjson + +from aurweb import config, db +from aurweb.models import User + +from .base import GitInfo, SpecBase, SpecOutput + +ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2 + + +class Spec(SpecBase): + def __init__(self) -> "Spec": + self.users_repo = GitInfo(config.get("git-archive", "users-repo")) + + def generate(self) -> Iterable[SpecOutput]: + query = db.query(User.Username).order_by(User.Username.asc()).all() + users = [user.Username for user in query] + + self.add_output( + "users.json", + self.users_repo, + orjson.dumps(users, option=ORJSON_OPTS), + ) + return self.outputs diff --git a/aurweb/models/package_base.py b/aurweb/models/package_base.py index bf80233d..26d9165f 100644 --- a/aurweb/models/package_base.py +++ b/aurweb/models/package_base.py @@ -64,3 +64,13 @@ class PackageBase(Base): if key in PackageBase.TO_FLOAT and not isinstance(attr, float): return float(attr) return attr + + +def popularity_decay(pkgbase: PackageBase, utcnow: int): + """Return the delta between now and the last time popularity was updated, in days""" + return int((utcnow - pkgbase.PopularityUpdated.timestamp()) / 86400) + + +def popularity(pkgbase: PackageBase, utcnow: int): + """Return up-to-date popularity""" + return float(pkgbase.Popularity) * (0.98 ** popularity_decay(pkgbase, utcnow)) diff --git a/aurweb/pkgbase/util.py b/aurweb/pkgbase/util.py index 968135d1..46d6e2db 100644 --- a/aurweb/pkgbase/util.py +++ 
b/aurweb/pkgbase/util.py @@ -3,8 +3,9 @@ from typing import Any from fastapi import Request from sqlalchemy import and_ -from aurweb import config, db, defaults, l10n, util +from aurweb import config, db, defaults, l10n, time, util from aurweb.models import PackageBase, User +from aurweb.models.package_base import popularity from aurweb.models.package_comaintainer import PackageComaintainer from aurweb.models.package_comment import PackageComment from aurweb.models.package_request import PENDING_ID, PackageRequest @@ -81,6 +82,8 @@ def make_context( and_(PackageRequest.Status == PENDING_ID, PackageRequest.ClosedTS.is_(None)) ).count() + context["popularity"] = popularity(pkgbase, time.utcnow()) + return context diff --git a/aurweb/rpc.py b/aurweb/rpc.py index 26677f80..515c6ffb 100644 --- a/aurweb/rpc.py +++ b/aurweb/rpc.py @@ -6,9 +6,10 @@ from fastapi.responses import HTMLResponse from sqlalchemy import and_, literal, orm import aurweb.config as config -from aurweb import db, defaults, models +from aurweb import db, defaults, models, time from aurweb.exceptions import RPCError from aurweb.filters import number_format +from aurweb.models.package_base import popularity from aurweb.packages.search import RPCSearch TYPE_MAPPING = { @@ -120,16 +121,15 @@ class RPC: if not args: raise RPCError("No request type/data specified.") - def _get_json_data(self, package: models.Package) -> dict[str, Any]: + def get_json_data(self, package: models.Package) -> dict[str, Any]: """Produce dictionary data of one Package that can be JSON-serialized. :param package: Package instance :returns: JSON-serializable dictionary """ - # Produce RPC API compatible Popularity: If zero, it's an integer - # 0, otherwise, it's formatted to the 6th decimal place. 
- pop = package.Popularity + # Normalize Popularity for RPC output to 6 decimal precision + pop = popularity(package, time.utcnow()) pop = 0 if not pop else float(number_format(pop, 6)) snapshot_uri = config.get("options", "snapshot_uri") @@ -151,8 +151,8 @@ class RPC: "LastModified": package.ModifiedTS, } - def _get_info_json_data(self, package: models.Package) -> dict[str, Any]: - data = self._get_json_data(package) + def get_info_json_data(self, package: models.Package) -> dict[str, Any]: + data = self.get_json_data(package) # All info results have _at least_ an empty list of # License and Keywords. @@ -176,7 +176,7 @@ class RPC: """ return [data_generator(pkg) for pkg in packages] - def _entities(self, query: orm.Query) -> orm.Query: + def entities(self, query: orm.Query) -> orm.Query: """Select specific RPC columns on `query`.""" return query.with_entities( models.Package.ID, @@ -188,38 +188,14 @@ class RPC: models.PackageBase.Name.label("PackageBaseName"), models.PackageBase.NumVotes, models.PackageBase.Popularity, + models.PackageBase.PopularityUpdated, models.PackageBase.OutOfDateTS, models.PackageBase.SubmittedTS, models.PackageBase.ModifiedTS, models.User.Username.label("Maintainer"), ).group_by(models.Package.ID) - def _handle_multiinfo_type( - self, args: list[str] = [], **kwargs - ) -> list[dict[str, Any]]: - self._enforce_args(args) - args = set(args) - - packages = ( - db.query(models.Package) - .join(models.PackageBase) - .join( - models.User, - models.User.ID == models.PackageBase.MaintainerUID, - isouter=True, - ) - .filter(models.Package.Name.in_(args)) - ) - - max_results = config.getint("options", "max_rpc_results") - packages = self._entities(packages).limit(max_results + 1) - - if packages.count() > max_results: - raise RPCError("Too many package results.") - - ids = {pkg.ID for pkg in packages} - - # Aliases for 80-width. 
+ def subquery(self, ids: set[int]): Package = models.Package PackageKeyword = models.PackageKeyword @@ -311,7 +287,33 @@ class RPC: self.extra_info[record.ID][type_].append(name) - return self._assemble_json_data(packages, self._get_info_json_data) + def _handle_multiinfo_type( + self, args: list[str] = [], **kwargs + ) -> list[dict[str, Any]]: + self._enforce_args(args) + args = set(args) + + packages = ( + db.query(models.Package) + .join(models.PackageBase) + .join( + models.User, + models.User.ID == models.PackageBase.MaintainerUID, + isouter=True, + ) + .filter(models.Package.Name.in_(args)) + ) + + max_results = config.getint("options", "max_rpc_results") + packages = self.entities(packages).limit(max_results + 1) + + if packages.count() > max_results: + raise RPCError("Too many package results.") + + ids = {pkg.ID for pkg in packages} + self.subquery(ids) + + return self._assemble_json_data(packages, self.get_info_json_data) def _handle_search_type( self, by: str = defaults.RPC_SEARCH_BY, args: list[str] = [] @@ -330,12 +332,12 @@ class RPC: search.search_by(by, arg) max_results = config.getint("options", "max_rpc_results") - results = self._entities(search.results()).limit(max_results + 1).all() + results = self.entities(search.results()).limit(max_results + 1).all() if len(results) > max_results: raise RPCError("Too many package results.") - return self._assemble_json_data(results, self._get_json_data) + return self._assemble_json_data(results, self.get_json_data) def _handle_msearch_type( self, args: list[str] = [], **kwargs diff --git a/aurweb/schema.py b/aurweb/schema.py index b3b36195..5f998ed9 100644 --- a/aurweb/schema.py +++ b/aurweb/schema.py @@ -155,6 +155,12 @@ PackageBases = Table( nullable=False, server_default=text("0"), ), + Column( + "PopularityUpdated", + TIMESTAMP, + nullable=False, + server_default=text("'1970-01-01 00:00:01.000000'"), + ), Column("OutOfDateTS", BIGINT(unsigned=True)), Column("FlaggerComment", Text, nullable=False), 
Column("SubmittedTS", BIGINT(unsigned=True), nullable=False), diff --git a/aurweb/scripts/git_archive.py b/aurweb/scripts/git_archive.py new file mode 100644 index 00000000..4c909c18 --- /dev/null +++ b/aurweb/scripts/git_archive.py @@ -0,0 +1,125 @@ +import argparse +import importlib +import os +import sys +import traceback +from datetime import datetime + +import orjson +import pygit2 + +from aurweb import config + +# Constants +REF = "refs/heads/master" +ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2 + + +def init_repository(git_info) -> None: + pygit2.init_repository(git_info.path) + repo = pygit2.Repository(git_info.path) + for k, v in git_info.config.items(): + repo.config[k] = v + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--spec", + type=str, + required=True, + help="name of spec module in the aurweb.archives.spec package", + ) + return parser.parse_args() + + +def update_repository(repo: pygit2.Repository): + # Use git status to determine file changes + has_changes = False + changes = repo.status() + for filepath, flags in changes.items(): + if flags != pygit2.GIT_STATUS_CURRENT: + has_changes = True + break + + if has_changes: + print("diff detected, committing") + # Add everything in the tree. 
+ print("adding files to git tree") + + # Add the tree to staging + repo.index.read() + repo.index.add_all() + repo.index.write() + tree = repo.index.write_tree() + + # Determine base commit; if repo.head.target raises GitError, + # we have no current commits + try: + base = [repo.head.target] + except pygit2.GitError: + base = [] + + utcnow = datetime.utcnow() + author = pygit2.Signature( + config.get("git-archive", "author"), + config.get("git-archive", "author-email"), + int(utcnow.timestamp()), + 0, + ) + + # Commit the changes + timestamp = utcnow.strftime("%Y-%m-%d %H:%M:%S") + title = f"update - {timestamp}" + repo.create_commit(REF, author, author, title, tree, base) + + print("committed changes") + else: + print("no diff detected") + + +def main() -> int: + args = parse_args() + + print(f"loading '{args.spec}' spec") + spec_package = "aurweb.archives.spec" + module_path = f"{spec_package}.{args.spec}" + spec_module = importlib.import_module(module_path) + print(f"loaded '{args.spec}'") + + # Track repositories that the spec modifies. After we run + # through specs, we want to make a single commit for all + # repositories that contain changes. 
+ repos = dict() + + print(f"running '{args.spec}' spec...") + spec = spec_module.Spec() + for output in spec.generate(): + if not os.path.exists(output.git_info.path / ".git"): + init_repository(output.git_info) + + path = output.git_info.path / output.filename + with open(path, "wb") as f: + f.write(output.data) + + if output.git_info.path not in repos: + repos[output.git_info.path] = pygit2.Repository(output.git_info.path) + + print(f"done running '{args.spec}' spec") + + print("processing repositories") + for path in spec.repos: + print(f"processing repository: {path}") + update_repository(pygit2.Repository(path)) + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + sys.exit(0) + except Exception: + traceback.print_exc() + sys.exit(1) diff --git a/aurweb/scripts/mkpkglists.py b/aurweb/scripts/mkpkglists.py index 7ca171ab..bfdd12b4 100755 --- a/aurweb/scripts/mkpkglists.py +++ b/aurweb/scripts/mkpkglists.py @@ -188,6 +188,7 @@ def _main(): USERS = aurweb.config.get("mkpkglists", "userfile") bench = Benchmark() + logger.warning(f"{sys.argv[0]} is deprecated and will be soon be removed") logger.info("Started re-creating archives, wait a while...") query = ( diff --git a/aurweb/scripts/popupdate.py b/aurweb/scripts/popupdate.py index aa163be1..83506e22 100755 --- a/aurweb/scripts/popupdate.py +++ b/aurweb/scripts/popupdate.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 +from datetime import datetime from sqlalchemy import and_, func from sqlalchemy.sql.functions import coalesce, sum as _sum -from aurweb import db, time +from aurweb import config, db, time from aurweb.models import PackageBase, PackageVote @@ -46,13 +47,24 @@ def run_variable(pkgbases: list[PackageBase] = []) -> None: ids = set() if pkgbases: + # If `pkgbases` were given, we should forcefully update the given + # package base records' popularities. 
ids = {pkgbase.ID for pkgbase in pkgbases} query = query.filter(PackageBase.ID.in_(ids)) + else: + # Otherwise, we should only update popularities which have exceeded + # the popularity interval length. + interval = config.getint("git-archive", "popularity-interval") + query = query.filter( + PackageBase.PopularityUpdated + <= datetime.fromtimestamp((now - interval)) + ) query.update( { "NumVotes": votes_subq.scalar_subquery(), "Popularity": pop_subq.scalar_subquery(), + "PopularityUpdated": datetime.fromtimestamp(now), } ) diff --git a/aurweb/testing/git.py b/aurweb/testing/git.py index 216515c8..39af87de 100644 --- a/aurweb/testing/git.py +++ b/aurweb/testing/git.py @@ -1,6 +1,4 @@ import os -import shlex -from subprocess import PIPE, Popen from typing import Tuple import py @@ -8,6 +6,7 @@ import py from aurweb.models import Package from aurweb.templates import base_template from aurweb.testing.filelock import FileLock +from aurweb.util import shell_exec class GitRepository: @@ -24,10 +23,7 @@ class GitRepository: self.file_lock.lock(on_create=self._setup) def _exec(self, cmdline: str, cwd: str) -> Tuple[int, str, str]: - args = shlex.split(cmdline) - proc = Popen(args, cwd=cwd, stdout=PIPE, stderr=PIPE) - out, err = proc.communicate() - return (proc.returncode, out.decode().strip(), err.decode().strip()) + return shell_exec(cmdline, cwd) def _exec_repository(self, cmdline: str) -> Tuple[int, str, str]: return self._exec(cmdline, cwd=str(self.file_lock.path)) diff --git a/aurweb/util.py b/aurweb/util.py index 4f1bd64e..432b818a 100644 --- a/aurweb/util.py +++ b/aurweb/util.py @@ -1,6 +1,7 @@ import math import re import secrets +import shlex import string from datetime import datetime from http import HTTPStatus @@ -192,3 +193,10 @@ def parse_ssh_key(string: str) -> Tuple[str, str]: def parse_ssh_keys(string: str) -> list[Tuple[str, str]]: """Parse a list of SSH public keys.""" return [parse_ssh_key(e) for e in string.splitlines()] + + +def shell_exec(cmdline: 
str, cwd: str) -> Tuple[int, str, str]: + args = shlex.split(cmdline) + proc = Popen(args, cwd=cwd, stdout=PIPE, stderr=PIPE) + out, err = proc.communicate() + return (proc.returncode, out.decode().strip(), err.decode().strip()) diff --git a/conf/config.defaults b/conf/config.defaults index 722802cc..6cdffe65 100644 --- a/conf/config.defaults +++ b/conf/config.defaults @@ -131,6 +131,18 @@ packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz userfile = /srv/http/aurweb/web/html/users.gz +[git-archive] +author = git_archive.py +author-email = no-reply@archlinux.org + +; One week worth of seconds (86400 * 7) +popularity-interval = 604800 + +metadata-repo = /srv/http/aurweb/metadata.git +users-repo = /srv/http/aurweb/users.git +pkgbases-repo = /srv/http/aurweb/pkgbases.git +pkgnames-repo = /srv/http/aurweb/pkgnames.git + [devel] ; commit_url is a format string used to produce a link to a commit hash. commit_url = https://gitlab.archlinux.org/archlinux/aurweb/-/commits/%s diff --git a/conf/config.dev b/conf/config.dev index 923c34ff..b36bfe77 100644 --- a/conf/config.dev +++ b/conf/config.dev @@ -76,5 +76,11 @@ packagesmetaextfile = /var/lib/aurweb/archives/packages-meta-ext-v1.json.gz pkgbasefile = /var/lib/aurweb/archives/pkgbase.gz userfile = /var/lib/aurweb/archives/users.gz +[git-archive] +metadata-repo = metadata.git +users-repo = users.git +pkgbases-repo = pkgbases.git +pkgnames-repo = pkgnames.git + [aurblup] db-path = YOUR_AUR_ROOT/aurblup/ diff --git a/doc/git-archive.md b/doc/git-archive.md new file mode 100644 index 00000000..cbc148b9 --- /dev/null +++ b/doc/git-archive.md @@ -0,0 +1,75 @@ +# aurweb Git Archive Specification + + + WARNING: This aurweb Git Archive implementation is + experimental and may be changed. 
+ + +## Overview + +This git archive specification refers to the archive git repositories +created by [aurweb/scripts/git_archive.py](aurweb/scripts/git_archive.py) +using [spec modules](#spec-modules). + +## Configuration + +- `[git-archive]` + - `author` + - Git commit author + - `author-email` + - Git commit author email + +See an [official spec](#official-specs)'s documentation for spec-specific +configurations. + +## Fetch/Update Archives + +When a client has not yet fetched any initial archives, they should clone +the repository: + + $ git clone https://aur.archlinux.org/archive.git aurweb-archive + +When updating, the repository is already cloned and changes need to be pulled +from remote: + + # To update: + $ cd aurweb-archive && git pull + +For end-user production applications, see +[Minimize Disk Space](#minimize-disk-space). + +## Minimize Disk Space + +Using `git gc` on the repository will compress revisions and remove +unreachable objects which grow the repository a considerable amount +each commit. It is recommended that the following command is used +after cloning the archive or pulling updates: + + $ cd aurweb-archive && git gc --aggressive + +## Spec Modules + +Each aurweb spec module belongs to the `aurweb.archives.spec` package. For +example: a spec named "example" would be located at +`aurweb.archives.spec.example`. 
+ +[Official spec listings](#official-specs) use the following format: + +- `spec_name` + - Spec description; what this spec produces + - `` + +### Official Specs + +- [metadata](doc/specs/metadata.md) + - Package RPC `type=info` metadata + - [metadata-repo](repos/metadata-repo.md) +- [users](doc/specs/users.md) + - List of users found in the database + - [users-repo](repos/users-repo.md) +- [pkgbases](doc/specs/pkgbases.md) + - List of package bases found in the database + - [pkgbases-repo](repos/pkgbases-repo.md) +- [pkgnames](doc/specs/pkgnames.md) + - List of package names found in the database + - [pkgnames-repo](repos/pkgnames-repo.md) diff --git a/doc/maintenance.txt b/doc/maintenance.txt index c52cf76f..56616f79 100644 --- a/doc/maintenance.txt +++ b/doc/maintenance.txt @@ -70,20 +70,48 @@ computations and clean up the database: * aurweb-pkgmaint automatically removes empty repositories that were created within the last 24 hours but never populated. -* aurweb-mkpkglists generates the package list files; it takes an optional - --extended flag, which additionally produces multiinfo metadata. It also - generates {archive.gz}.sha256 files that should be located within +* [Deprecated] aurweb-mkpkglists generates the package list files; it takes + an optional --extended flag, which additionally produces multiinfo metadata. + It also generates {archive.gz}.sha256 files that should be located within mkpkglists.archivedir which contain a SHA-256 hash of their matching .gz counterpart. * aurweb-usermaint removes the last login IP address of all users that did not login within the past seven days. +* aurweb-git-archive generates Git repository archives based on a --spec. + This script is a new generation of aurweb-mkpkglists, which creates and + maintains Git repository versions of the archives produced by + aurweb-mkpkglists. See doc/git-archive.md for detailed documentation. 
+ These scripts can be installed by running `poetry install` and are usually scheduled using Cron. The current setup is: ---- -*/5 * * * * poetry run aurweb-mkpkglists [--extended] +# Run aurweb-git-archive --spec metadata directly after +# aurweb-mkpkglists so that they are executed sequentially, since +# both scripts are quite heavy. `aurweb-mkpkglists` should be removed +# from here once its deprecation period has ended. +*/5 * * * * poetry run aurweb-mkpkglists [--extended] && poetry run aurweb-git-archive --spec metadata + +# Update popularity once an hour. This is done to reduce the amount +# of changes caused by popularity data. Even if a package is otherwise +# unchanged, popularity is recalculated every 5 minutes via aurweb-popupdate, +# which causes changes for a large chunk of packages. +# +# At this interval, clients can still take advantage of popularity +# data, but its updates are guarded behind hour-long intervals. +*/60 * * * * poetry run aurweb-git-archive --spec popularity + +# Usernames +*/5 * * * * poetry run aurweb-git-archive --spec users + +# Package base names +*/5 * * * * poetry run aurweb-git-archive --spec pkgbases + +# Package names +*/5 * * * * poetry run aurweb-git-archive --spec pkgnames + 1 */2 * * * poetry run aurweb-popupdate 2 */2 * * * poetry run aurweb-aurblup 3 */2 * * * poetry run aurweb-pkgmaint diff --git a/doc/repos/metadata-repo.md b/doc/repos/metadata-repo.md new file mode 100644 index 00000000..cc678f40 --- /dev/null +++ b/doc/repos/metadata-repo.md @@ -0,0 +1,121 @@ +# Repository: metadata-repo + +## Overview + +The resulting repository contains RPC `type=info` JSON data for packages, +split into two different files: + +- `pkgbase.json` contains details about each package base in the AUR +- `pkgname.json` contains details about each package in the AUR + +See [Data](#data) for a breakdown of how data is presented in this +repository based off of a RPC `type=info` base. 
+ +See [File Layout](#file-layout) for a detailed summary of the layout +of these files and the data contained within. + +**NOTE: `Popularity` now requires a client-side calculation, see [Popularity Calculation](#popularity-calculation).** + +## Data + +This repository contains RPC `type=info` data for all packages found +in AUR's database, reorganized to be suitable for Git repository +changes. + +- `pkgname.json` holds Package-specific metadata + - Some fields have been removed from `pkgname.json` objects + - `ID` + - `PackageBaseID -> ID` (moved to `pkgbase.json`) + - `NumVotes` (moved to `pkgbase.json`) + - `Popularity` (moved to `pkgbase.json`) +- `pkgbase.json` holds PackageBase-specific metadata + - Package Base fields from `pkgname.json` have been moved over to + `pkgbase.json` + - `ID` + - `Keywords` + - `FirstSubmitted` + - `LastModified` + - `OutOfDate` + - `Maintainer` + - `URLPath` + - `NumVotes` + - `Popularity` + - `PopularityUpdated` + +## Popularity Calculation + +Clients intending to use popularity data from this archive **must** +perform a decay calculation on their end to reflect a close approximation +of up-to-date popularity. + +Putting this step onto the client allows the server to maintain +fewer popularity record updates, dramatically improving archiving +of popularity data. The same calculation is done on the server-side +when producing outputs for RPC `type=info` and package pages. + +``` +Let T = Current UTC timestamp in seconds +Let PU = PopularityUpdated timestamp in seconds + +# The delta between now and PU in whole days (fractional part dropped) +Let D = int((T - PU) / 86400) + +# Calculate up-to-date popularity: +P = Popularity * (0.98^D) +``` + +We can see that the resulting up-to-date popularity value decays as +the exponent is increased: +- `1.0 * (0.98^1) = 0.98` +- `1.0 * (0.98^2) = 0.9604` +- ... + +This decay calculation is essentially pushing back the date found for +votes by the exponent, which takes into account the time-factor.
However, +since this calculation is based off of decimals and exponents, it +eventually becomes imprecise. The AUR updates these records on a forced +interval and whenever a vote is added to or removed from a particular package +to keep imprecision from becoming an issue for clients. + +## File Layout + +#### pkgbase.json: + + { + "pkgbase1": { + "FirstSubmitted": 123456, + "ID": 1, + "LastModified": 123456, + "Maintainer": "kevr", + "OutOfDate": null, + "URLPath": "/cgit/aur.git/snapshot/pkgbase1.tar.gz", + "NumVotes": 1, + "Popularity": 1.0, + "PopularityUpdated": 12345567753.0 + }, + ... + } + +#### pkgname.json: + + { + "pkg1": { + "CheckDepends": [], # Only included if a check dependency exists + "Conflicts": [], # Only included if a conflict exists + "Depends": [], # Only included if a dependency exists + "Description": "some description", + "Groups": [], # Only included if a group exists + "ID": 1, + "Keywords": [], + "License": [], + "MakeDepends": [], # Only included if a make dependency exists + "Name": "pkg1", + "OptDepends": [], # Only included if an opt dependency exists + "PackageBase": "pkgbase1", + "Provides": [], # Only included if `provides` is defined + "Replaces": [], # Only included if `replaces` is defined + "URL": "https://some_url.com", + "Version": "1.0-1" + }, + ... + } diff --git a/doc/repos/pkgbases-repo.md b/doc/repos/pkgbases-repo.md new file mode 100644 index 00000000..f4cb896f --- /dev/null +++ b/doc/repos/pkgbases-repo.md @@ -0,0 +1,15 @@ +# Repository: pkgbases-repo + +## Overview + +- `pkgbase.json` contains a list of package base names + +## File Layout + +### pkgbase.json: + + [ + "pkgbase1", + "pkgbase2", + ...
+ ] diff --git a/doc/repos/pkgnames-repo.md b/doc/repos/pkgnames-repo.md new file mode 100644 index 00000000..ae6fb4ed --- /dev/null +++ b/doc/repos/pkgnames-repo.md @@ -0,0 +1,15 @@ +# Repository: pkgnames-repo + +## Overview + +- `pkgname.json` contains a list of package names + +## File Layout + +### pkgname.json: + + [ + "pkgname1", + "pkgname2", + ... + ] diff --git a/doc/repos/users-repo.md b/doc/repos/users-repo.md new file mode 100644 index 00000000..23db9cfb --- /dev/null +++ b/doc/repos/users-repo.md @@ -0,0 +1,15 @@ +# Repository: users-repo + +## Overview + +- `users.json` contains a list of usernames + +## File Layout + +### users.json: + + [ + "user1", + "user2", + ... + ] diff --git a/doc/specs/metadata.md b/doc/specs/metadata.md new file mode 100644 index 00000000..282c0dd5 --- /dev/null +++ b/doc/specs/metadata.md @@ -0,0 +1,14 @@ +# Git Archive Spec: metadata + +## Configuration + +- `[git-archive]` + - `metadata-repo` + - Path to package metadata git repository location + +## Repositories + +For documentation on each one of these repositories, follow their link, +which brings you to a topical markdown for that repository. + +- [metadata-repo](doc/repos/metadata-repo.md) diff --git a/doc/specs/pkgbases.md b/doc/specs/pkgbases.md new file mode 100644 index 00000000..80279070 --- /dev/null +++ b/doc/specs/pkgbases.md @@ -0,0 +1,14 @@ +# Git Archive Spec: pkgbases + +## Configuration + +- `[git-archive]` + - `pkgbases-repo` + - Path to pkgbases git repository location + +## Repositories + +For documentation on each one of these repositories, follow their link, +which brings you to a topical markdown for that repository. 
+ +- [pkgbases-repo](../repos/pkgbases-repo.md) diff --git a/doc/specs/pkgnames.md b/doc/specs/pkgnames.md new file mode 100644 index 00000000..0a4a907d --- /dev/null +++ b/doc/specs/pkgnames.md @@ -0,0 +1,14 @@ +# Git Archive Spec: pkgnames + +## Configuration + +- `[git-archive]` + - `pkgnames-repo` + - Path to pkgnames git repository location + +## Repositories + +For documentation on each of these repositories, follow its link, +which leads to the markdown document for that repository. + +- [pkgnames-repo](../repos/pkgnames-repo.md) diff --git a/doc/specs/popularity.md b/doc/specs/popularity.md new file mode 100644 index 00000000..3084f458 --- /dev/null +++ b/doc/specs/popularity.md @@ -0,0 +1,14 @@ +# Git Archive Spec: popularity + +## Configuration + +- `[git-archive]` + - `popularity-repo` + - Path to popularity git repository location + +## Repositories + +For documentation on each of these repositories, follow its link, +which leads to the markdown document for that repository. + +- [popularity-repo](../repos/popularity-repo.md) diff --git a/doc/specs/users.md b/doc/specs/users.md new file mode 100644 index 00000000..25396154 --- /dev/null +++ b/doc/specs/users.md @@ -0,0 +1,14 @@ +# Git Archive Spec: users + +## Configuration + +- `[git-archive]` + - `users-repo` + - Path to users git repository location + +## Repositories + +For documentation on each of these repositories, follow its link, +which leads to the markdown document for that repository.
"""add PopularityUpdated to PackageBase

Revision ID: 6441d3b65270
Revises: d64e5571bc8d
Create Date: 2022-09-22 18:08:03.280664

"""
from alembic import op
from sqlalchemy.exc import OperationalError

from aurweb.models.package_base import PackageBase
from aurweb.scripts import popupdate

# revision identifiers, used by Alembic.
revision = "6441d3b65270"
down_revision = "d64e5571bc8d"
branch_labels = None
depends_on = None

# Table definition taken from the model so the migration and the ORM agree
# on the table name and on the column being added.
table = PackageBase.__table__


def upgrade():
    """Add the PopularityUpdated column to PackageBase, then refresh popularity.

    An OperationalError raised while adding the column is reported and
    swallowed rather than aborting the migration; `popupdate.run_variable()`
    is called afterwards in either case.
    """
    try:
        op.add_column(table.name, table.c.PopularityUpdated)
    except OperationalError:
        # NOTE(review): this handler assumes the only expected failure is a
        # duplicate column (e.g. a schema created from the current models);
        # any other operational error is reported the same way -- confirm.
        print(
            f"column 'PopularityUpdated' already exists in table "
            f"'{table.name}', skipping column creation"
        )

    # Runs regardless of whether the column was just created.
    popupdate.run_variable()


def downgrade():
    """Drop the PopularityUpdated column from PackageBase."""
    op.drop_column(table.name, "PopularityUpdated")
"""Tests for aurweb.scripts.git_archive and the --spec modules it loads."""
from http import HTTPStatus
from unittest import mock

import py
import pygit2
import pytest
from fastapi.testclient import TestClient

from aurweb import asgi, config, db
from aurweb.archives.spec.base import GitInfo, SpecBase
from aurweb.models import Package, PackageBase, User
from aurweb.scripts import git_archive
from aurweb.testing.requests import Request


def _mock_archive_repo(option: str, repo_path):
    """Yield `repo_path` while `[git-archive] <option>` resolves to it.

    `aurweb.config.get` is patched for the duration of the fixture; every
    other (section, option) pair is forwarded to the real `config.get`.
    Meant to be consumed with `yield from` inside a pytest fixture.

    :param option: Option name within the `git-archive` section
    :param repo_path: Repository path the option should resolve to
    """
    # Capture the real getter before the patch is installed.
    real_get = config.get

    def fake_get(section: str, key: str) -> str:
        if section == "git-archive" and key == option:
            return str(repo_path)
        return real_get(section, key)

    with mock.patch("aurweb.config.get", side_effect=fake_get):
        yield repo_path


def _mock_spec_argv(spec: str, repo_path):
    """Yield `repo_path` while sys.argv is patched to select `--spec <spec>`.

    Meant to be consumed with `yield from` inside a pytest fixture.

    :param spec: Value passed after `--spec`
    :param repo_path: Repository path handed through to the test
    """
    with mock.patch("sys.argv", [__name__, "--spec", spec]):
        yield repo_path


@pytest.fixture
def mock_metadata_archive(tmp_path: py.path.local) -> py.path.local:
    """Point `[git-archive] metadata-repo` at a temporary path."""
    yield from _mock_archive_repo("metadata-repo", tmp_path / "metadata.git")


@pytest.fixture
def mock_users_archive(tmp_path: py.path.local) -> py.path.local:
    """Point `[git-archive] users-repo` at a temporary path."""
    yield from _mock_archive_repo("users-repo", tmp_path / "users.git")


@pytest.fixture
def mock_pkgbases_archive(tmp_path: py.path.local) -> py.path.local:
    """Point `[git-archive] pkgbases-repo` at a temporary path."""
    yield from _mock_archive_repo("pkgbases-repo", tmp_path / "pkgbases.git")


@pytest.fixture
def mock_pkgnames_archive(tmp_path: py.path.local) -> py.path.local:
    """Point `[git-archive] pkgnames-repo` at a temporary path."""
    yield from _mock_archive_repo("pkgnames-repo", tmp_path / "pkgnames.git")


@pytest.fixture
def metadata(mock_metadata_archive: py.path.local) -> py.path.local:
    """Select the `metadata` spec via sys.argv; yields the repository path."""
    yield from _mock_spec_argv("metadata", mock_metadata_archive)


@pytest.fixture
def users(mock_users_archive: py.path.local) -> py.path.local:
    """Select the `users` spec via sys.argv; yields the repository path."""
    yield from _mock_spec_argv("users", mock_users_archive)


@pytest.fixture
def pkgbases(mock_pkgbases_archive: py.path.local) -> py.path.local:
    """Select the `pkgbases` spec via sys.argv; yields the repository path."""
    yield from _mock_spec_argv("pkgbases", mock_pkgbases_archive)


@pytest.fixture
def pkgnames(mock_pkgnames_archive: py.path.local) -> py.path.local:
    """Select the `pkgnames` spec via sys.argv; yields the repository path."""
    yield from _mock_spec_argv("pkgnames", mock_pkgnames_archive)


@pytest.fixture
def client() -> TestClient:
    """Test client bound to the aurweb ASGI application."""
    yield TestClient(app=asgi.app)


@pytest.fixture
def user(db_test: None) -> User:
    """A user record created in the test database."""
    with db.begin():
        user_ = db.create(
            User,
            Username="test",
            Email="test@example.org",
            Passwd="testPassword",
        )

    yield user_


@pytest.fixture
def package(user: User) -> Package:
    """A package (and its package base) maintained and packaged by `user`."""
    with db.begin():
        pkgbase_ = db.create(
            PackageBase,
            Name="test",
            Maintainer=user,
            Packager=user,
        )

        pkg_ = db.create(
            Package,
            PackageBase=pkgbase_,
            Name="test",
        )

    yield pkg_


def commit_count(repo: pygit2.Repository) -> int:
    """Return the number of commits walked starting from `repo`'s HEAD."""
    return sum(1 for _ in repo.walk(repo.head.target))


def test_specbase_raises_notimplementederror():
    """SpecBase.generate() must be overridden by derived specs."""
    spec = SpecBase()
    with pytest.raises(NotImplementedError):
        spec.generate()


def test_gitinfo_config(tmpdir: py.path.local):
    """Configuration given to GitInfo ends up in the initialized repository."""
    path = tmpdir / "test.git"
    git_info = GitInfo(path, {"user.name": "Test Person"})
    git_archive.init_repository(git_info)

    repo = pygit2.Repository(path)
    assert repo.config["user.name"] == "Test Person"


def test_metadata(metadata: py.path.local, package: Package):
    """A second run without data changes must not add a commit."""
    # Run main(), which creates mock_metadata_archive and commits current
    # package data to it, exercising the "diff detected, committing" path
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 1

    # Run main() again to exercise the "no diff detected" path
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 1


def test_metadata_change(
    client: TestClient, metadata: py.path.local, user: User, package: Package
):
    """Test that metadata changes via aurweb cause git_archive to produce diffs."""
    # Run main(), which creates mock_metadata_archive and commits current
    # package data to it, exercising the "diff detected, committing" path
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 1

    # Now, we modify `package`-related metadata via aurweb POST.
    pkgbasename = package.PackageBase.Name
    cookies = {"AURSID": user.login(Request(), "testPassword")}

    with client as request:
        endp = f"/pkgbase/{pkgbasename}/keywords"
        post_data = {"keywords": "abc def"}
        resp = request.post(endp, data=post_data, cookies=cookies, allow_redirects=True)
    assert resp.status_code == HTTPStatus.OK

    # Run main() again, which should now produce a new commit with the
    # keyword changes we just made
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 2


def test_metadata_delete(client: TestClient, metadata: py.path.local, package: Package):
    """Deleting a package must show up as a new commit."""
    # Run main(), which creates mock_metadata_archive and commits current
    # package data to it, exercising the "diff detected, committing" path
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 1

    with db.begin():
        db.delete(package)

    # The deletion here should have caused a diff to be produced in git
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 2


def test_users(users: py.path.local, user: User):
    """The `users` spec produces exactly one commit on first run."""
    assert git_archive.main() == 0
    repo = pygit2.Repository(users)
    assert commit_count(repo) == 1


def test_pkgbases(pkgbases: py.path.local, package: Package):
    """The `pkgbases` spec produces exactly one commit on first run."""
    assert git_archive.main() == 0
    repo = pygit2.Repository(pkgbases)
    assert commit_count(repo) == 1


def test_pkgnames(pkgnames: py.path.local, package: Package):
    """The `pkgnames` spec produces exactly one commit on first run."""
    assert git_archive.main() == 0
    repo = pygit2.Repository(pkgnames)
    assert commit_count(repo) == 1
aurweb.models.package_base import popularity from aurweb.models.package_license import PackageLicense from aurweb.models.package_relation import PackageRelation from aurweb.models.relation_type import PROVIDES_ID, REPLACES_ID @@ -287,12 +288,14 @@ def test_package_details(user: User, package: Package): """Test package details with most fields populated, but not all.""" request = Request(user=user, authenticated=True) context = make_context(request, "Test Details") + context.update( { "request": request, "git_clone_uri_anon": GIT_CLONE_URI_ANON, "git_clone_uri_priv": GIT_CLONE_URI_PRIV, "pkgbase": package.PackageBase, + "popularity": popularity(package.PackageBase, time.utcnow()), "package": package, "comaintainers": [], } @@ -329,6 +332,7 @@ def test_package_details_filled(user: User, package: Package): "git_clone_uri_anon": GIT_CLONE_URI_ANON, "git_clone_uri_priv": GIT_CLONE_URI_PRIV, "pkgbase": package.PackageBase, + "popularity": popularity(package.PackageBase, time.utcnow()), "package": package, "comaintainers": [], "licenses": package.package_licenses,