feat: archive git repository (experimental)

See doc/git-archive.md for general Git archive specifications
See doc/repos/metadata-repo.md for info and direction related to the new Git metadata archive
This commit is contained in:
Kevin Morris 2022-09-24 16:51:25 +00:00
parent ec3152014b
commit 30e72d2db5
34 changed files with 1104 additions and 50 deletions

View file

@ -0,0 +1 @@
# aurweb.archives

View file

@ -0,0 +1 @@
# aurweb.archives.spec

View file

@ -0,0 +1,77 @@
from pathlib import Path
from typing import Any, Dict, Iterable, List, Set
class GitInfo:
    """Information about a Git repository."""

    # Path to the Git repository (stored as a `pathlib.Path`).
    path: Path

    # Local Git repository configuration.
    config: Dict[str, Any]

    def __init__(self, path: str, config: "Dict[str, Any] | None" = None) -> None:
        """
        Construct a GitInfo.

        :param path: Path to the Git repository; converted to `pathlib.Path`
        :param config: Optional repository configuration mapping. When
                       omitted, a fresh empty dict is created for this
                       instance so instances never share one default dict.
        """
        self.path = Path(path)
        self.config = {} if config is None else config
class SpecOutput:
    """Class used for git_archive.py output details."""

    # Filename relative to the Git repository root.
    filename: str

    # Git repository information.
    git_info: "GitInfo"

    # Bytes bound for `SpecOutput.filename`.
    data: bytes

    def __init__(self, filename: str, git_info: "GitInfo", data: bytes) -> None:
        """
        Construct a SpecOutput.

        :param filename: Filename relative to the Git repository root
        :param git_info: GitInfo instance describing the target repository
        :param data: Binary data bound for `filename`
        """
        self.filename = filename
        self.git_info = git_info
        self.data = data
class SpecBase:
    """
    Base for Spec classes defined in git_archive.py --spec modules.

    All supported --spec modules must contain the following classes:
    - Spec(SpecBase)
    """

    # SpecOutputs produced by this spec, each of which contains output
    # file data.
    outputs: List["SpecOutput"]

    # Paths of the repositories to commit changes to.
    repos: Set[Path]

    def __new__(cls, *args, **kwargs) -> "SpecBase":
        # The containers are created per instance here. Class-level
        # containers would be shared by every spec, and __init__ cannot be
        # relied on because subclasses may override it without calling
        # super().__init__().
        instance = super().__new__(cls)
        instance.outputs = []
        instance.repos = set()
        return instance

    def generate(self) -> Iterable["SpecOutput"]:
        """
        "Pure virtual" output generator.

        `outputs` and `repos` should be populated within an overridden
        version of this function in SpecBase derivatives.

        :raises NotImplementedError: Always, unless overridden
        """
        raise NotImplementedError()

    def add_output(self, filename: str, git_info: "GitInfo", data: bytes) -> None:
        """
        Add a SpecOutput instance to the set of outputs.

        The repository path of `git_info` is also recorded in `repos`.

        :param filename: Filename relative to the git repository root
        :param git_info: GitInfo instance
        :param data: Binary data bound for `filename`
        """
        # set.add() is idempotent; no membership pre-check is required.
        self.repos.add(git_info.path)
        self.outputs.append(SpecOutput(filename, git_info, data))

View file

@ -0,0 +1,85 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import Package, PackageBase, User
from aurweb.rpc import RPC
from .base import GitInfo, SpecBase, SpecOutput
# orjson serialization options: emit keys in sorted order and indent by
# two spaces. NOTE(review): presumably chosen so that regenerated files
# produce stable, line-oriented Git diffs -- confirm.
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """
    Spec producing RPC `type=info` metadata for the metadata repository.

    Two outputs are generated: `pkgname.json` (per-package data) and
    `pkgbase.json` (per-package-base data).
    """

    def __init__(self) -> None:
        super().__init__()
        self.metadata_repo = GitInfo(
            config.get("git-archive", "metadata-repo"),
        )

    def generate(self) -> Iterable[SpecOutput]:
        """
        Query all packages and produce `pkgname.json` and `pkgbase.json`.

        :returns: The outputs accumulated on this spec
        """
        # Base query used by the RPC. The User join is an outer join so
        # that packages without a maintainer are still included, matching
        # the RPC's own multiinfo query.
        base_query = (
            db.query(Package)
            .join(PackageBase)
            .join(User, PackageBase.MaintainerUID == User.ID, isouter=True)
        )

        # Create an instance of RPC, use it to get entities from
        # our query and perform a metadata subquery for all packages.
        rpc = RPC(version=5, type="info")
        print("performing package database query")
        packages = rpc.entities(base_query).all()
        print("performing package database subqueries")
        rpc.subquery({pkg.ID for pkg in packages})

        pkgbases, pkgnames = dict(), dict()
        for package in packages:
            # Produce RPC type=info data for `package`
            data = rpc.get_info_json_data(package)

            pkgbase_name = data.get("PackageBase")

            # Move package base specific fields out of the package data.
            # This includes NumVotes and Popularity, which are popped from
            # `data` here and so only appear in the package base output.
            pkgbase_data = {
                "ID": data.pop("PackageBaseID"),
                "URLPath": data.pop("URLPath"),
                "FirstSubmitted": data.pop("FirstSubmitted"),
                "LastModified": data.pop("LastModified"),
                "OutOfDate": data.pop("OutOfDate"),
                "Maintainer": data.pop("Maintainer"),
                "Keywords": data.pop("Keywords"),
                "NumVotes": data.pop("NumVotes"),
                "Popularity": data.pop("Popularity"),
                "PopularityUpdated": package.PopularityUpdated.timestamp(),
            }

            # Store the data in `pkgbases` dict, keyed by package base
            # name, so a package base shared by several packages ends up
            # with a single entry.
            pkgbases[pkgbase_name] = pkgbase_data

            # Remove the ID key from package json.
            data.pop("ID")

            # Store the remaining package data under the package's Name.
            name = data.get("Name")
            pkgnames[name] = data

        # Add metadata outputs
        self.add_output(
            "pkgname.json",
            self.metadata_repo,
            orjson.dumps(pkgnames, option=ORJSON_OPTS),
        )
        self.add_output(
            "pkgbase.json",
            self.metadata_repo,
            orjson.dumps(pkgbases, option=ORJSON_OPTS),
        )

        return self.outputs

View file

@ -0,0 +1,32 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import PackageBase
from .base import GitInfo, SpecBase, SpecOutput
# orjson serialization options: emit keys in sorted order and indent by
# two spaces.
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing `pkgbase.json`, a JSON list of package base names."""

    def __init__(self) -> "Spec":
        repo_path = config.get("git-archive", "pkgbases-repo")
        self.pkgbases_repo = GitInfo(repo_path)

    def generate(self) -> Iterable[SpecOutput]:
        """
        Collect the names of all package bases whose PackagerUID is set,
        in ascending name order, and add them as `pkgbase.json`.

        :returns: The outputs accumulated on this spec
        """
        rows = (
            db.query(PackageBase.Name)
            .filter(PackageBase.PackagerUID.isnot(None))
            .order_by(PackageBase.Name.asc())
            .all()
        )

        names = []
        for row in rows:
            names.append(row.Name)

        payload = orjson.dumps(names, option=ORJSON_OPTS)
        self.add_output("pkgbase.json", self.pkgbases_repo, payload)
        return self.outputs

View file

@ -0,0 +1,33 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import Package, PackageBase
from .base import GitInfo, SpecBase, SpecOutput
# orjson serialization options: emit keys in sorted order and indent by
# two spaces.
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing `pkgname.json`, a JSON list of package names."""

    def __init__(self) -> "Spec":
        repo_path = config.get("git-archive", "pkgnames-repo")
        self.pkgnames_repo = GitInfo(repo_path)

    def generate(self) -> Iterable[SpecOutput]:
        """
        Collect the names of all packages whose package base has a
        PackagerUID set, in ascending name order, and add them as
        `pkgname.json`.

        :returns: The outputs accumulated on this spec
        """
        rows = (
            db.query(Package.Name)
            .join(PackageBase, PackageBase.ID == Package.PackageBaseID)
            .filter(PackageBase.PackagerUID.isnot(None))
            .order_by(Package.Name.asc())
            .all()
        )

        names = []
        for row in rows:
            names.append(row.Name)

        payload = orjson.dumps(names, option=ORJSON_OPTS)
        self.add_output("pkgname.json", self.pkgnames_repo, payload)
        return self.outputs

View file

@ -0,0 +1,26 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import User
from .base import GitInfo, SpecBase, SpecOutput
# orjson serialization options: emit keys in sorted order and indent by
# two spaces.
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing `users.json`, a JSON list of usernames."""

    def __init__(self) -> "Spec":
        repo_path = config.get("git-archive", "users-repo")
        self.users_repo = GitInfo(repo_path)

    def generate(self) -> Iterable[SpecOutput]:
        """
        Collect all usernames in ascending order and add them as
        `users.json`.

        :returns: The outputs accumulated on this spec
        """
        rows = db.query(User.Username).order_by(User.Username.asc()).all()

        usernames = []
        for row in rows:
            usernames.append(row.Username)

        payload = orjson.dumps(usernames, option=ORJSON_OPTS)
        self.add_output("users.json", self.users_repo, payload)
        return self.outputs

View file

@ -64,3 +64,13 @@ class PackageBase(Base):
if key in PackageBase.TO_FLOAT and not isinstance(attr, float): if key in PackageBase.TO_FLOAT and not isinstance(attr, float):
return float(attr) return float(attr)
return attr return attr
def popularity_decay(pkgbase: PackageBase, utcnow: int):
    """
    Return the number of days between `utcnow` and the last popularity update.

    The result is truncated to an integer by `int()`.

    :param pkgbase: Object exposing a `PopularityUpdated` datetime
    :param utcnow: Current timestamp in seconds
    :returns: Elapsed days as an integer
    """
    seconds_per_day = 86400
    last_updated = pkgbase.PopularityUpdated.timestamp()
    elapsed_days = (utcnow - last_updated) / seconds_per_day
    return int(elapsed_days)
def popularity(pkgbase: PackageBase, utcnow: int):
    """
    Return up-to-date popularity.

    The stored popularity is multiplied by 0.98 raised to the number of
    days since it was last updated.

    :param pkgbase: Object exposing `Popularity` and `PopularityUpdated`
    :param utcnow: Current timestamp in seconds
    :returns: Decayed popularity as a float
    """
    stored = float(pkgbase.Popularity)
    days_elapsed = popularity_decay(pkgbase, utcnow)
    decay_factor = 0.98**days_elapsed
    return stored * decay_factor

View file

@ -3,8 +3,9 @@ from typing import Any
from fastapi import Request from fastapi import Request
from sqlalchemy import and_ from sqlalchemy import and_
from aurweb import config, db, defaults, l10n, util from aurweb import config, db, defaults, l10n, time, util
from aurweb.models import PackageBase, User from aurweb.models import PackageBase, User
from aurweb.models.package_base import popularity
from aurweb.models.package_comaintainer import PackageComaintainer from aurweb.models.package_comaintainer import PackageComaintainer
from aurweb.models.package_comment import PackageComment from aurweb.models.package_comment import PackageComment
from aurweb.models.package_request import PENDING_ID, PackageRequest from aurweb.models.package_request import PENDING_ID, PackageRequest
@ -81,6 +82,8 @@ def make_context(
and_(PackageRequest.Status == PENDING_ID, PackageRequest.ClosedTS.is_(None)) and_(PackageRequest.Status == PENDING_ID, PackageRequest.ClosedTS.is_(None))
).count() ).count()
context["popularity"] = popularity(pkgbase, time.utcnow())
return context return context

View file

@ -6,9 +6,10 @@ from fastapi.responses import HTMLResponse
from sqlalchemy import and_, literal, orm from sqlalchemy import and_, literal, orm
import aurweb.config as config import aurweb.config as config
from aurweb import db, defaults, models from aurweb import db, defaults, models, time
from aurweb.exceptions import RPCError from aurweb.exceptions import RPCError
from aurweb.filters import number_format from aurweb.filters import number_format
from aurweb.models.package_base import popularity
from aurweb.packages.search import RPCSearch from aurweb.packages.search import RPCSearch
TYPE_MAPPING = { TYPE_MAPPING = {
@ -120,16 +121,15 @@ class RPC:
if not args: if not args:
raise RPCError("No request type/data specified.") raise RPCError("No request type/data specified.")
def _get_json_data(self, package: models.Package) -> dict[str, Any]: def get_json_data(self, package: models.Package) -> dict[str, Any]:
"""Produce dictionary data of one Package that can be JSON-serialized. """Produce dictionary data of one Package that can be JSON-serialized.
:param package: Package instance :param package: Package instance
:returns: JSON-serializable dictionary :returns: JSON-serializable dictionary
""" """
# Produce RPC API compatible Popularity: If zero, it's an integer # Normalize Popularity for RPC output to 6 decimal precision
# 0, otherwise, it's formatted to the 6th decimal place. pop = popularity(package, time.utcnow())
pop = package.Popularity
pop = 0 if not pop else float(number_format(pop, 6)) pop = 0 if not pop else float(number_format(pop, 6))
snapshot_uri = config.get("options", "snapshot_uri") snapshot_uri = config.get("options", "snapshot_uri")
@ -151,8 +151,8 @@ class RPC:
"LastModified": package.ModifiedTS, "LastModified": package.ModifiedTS,
} }
def _get_info_json_data(self, package: models.Package) -> dict[str, Any]: def get_info_json_data(self, package: models.Package) -> dict[str, Any]:
data = self._get_json_data(package) data = self.get_json_data(package)
# All info results have _at least_ an empty list of # All info results have _at least_ an empty list of
# License and Keywords. # License and Keywords.
@ -176,7 +176,7 @@ class RPC:
""" """
return [data_generator(pkg) for pkg in packages] return [data_generator(pkg) for pkg in packages]
def _entities(self, query: orm.Query) -> orm.Query: def entities(self, query: orm.Query) -> orm.Query:
"""Select specific RPC columns on `query`.""" """Select specific RPC columns on `query`."""
return query.with_entities( return query.with_entities(
models.Package.ID, models.Package.ID,
@ -188,38 +188,14 @@ class RPC:
models.PackageBase.Name.label("PackageBaseName"), models.PackageBase.Name.label("PackageBaseName"),
models.PackageBase.NumVotes, models.PackageBase.NumVotes,
models.PackageBase.Popularity, models.PackageBase.Popularity,
models.PackageBase.PopularityUpdated,
models.PackageBase.OutOfDateTS, models.PackageBase.OutOfDateTS,
models.PackageBase.SubmittedTS, models.PackageBase.SubmittedTS,
models.PackageBase.ModifiedTS, models.PackageBase.ModifiedTS,
models.User.Username.label("Maintainer"), models.User.Username.label("Maintainer"),
).group_by(models.Package.ID) ).group_by(models.Package.ID)
def _handle_multiinfo_type( def subquery(self, ids: set[int]):
self, args: list[str] = [], **kwargs
) -> list[dict[str, Any]]:
self._enforce_args(args)
args = set(args)
packages = (
db.query(models.Package)
.join(models.PackageBase)
.join(
models.User,
models.User.ID == models.PackageBase.MaintainerUID,
isouter=True,
)
.filter(models.Package.Name.in_(args))
)
max_results = config.getint("options", "max_rpc_results")
packages = self._entities(packages).limit(max_results + 1)
if packages.count() > max_results:
raise RPCError("Too many package results.")
ids = {pkg.ID for pkg in packages}
# Aliases for 80-width.
Package = models.Package Package = models.Package
PackageKeyword = models.PackageKeyword PackageKeyword = models.PackageKeyword
@ -311,7 +287,33 @@ class RPC:
self.extra_info[record.ID][type_].append(name) self.extra_info[record.ID][type_].append(name)
return self._assemble_json_data(packages, self._get_info_json_data) def _handle_multiinfo_type(
self, args: list[str] = [], **kwargs
) -> list[dict[str, Any]]:
self._enforce_args(args)
args = set(args)
packages = (
db.query(models.Package)
.join(models.PackageBase)
.join(
models.User,
models.User.ID == models.PackageBase.MaintainerUID,
isouter=True,
)
.filter(models.Package.Name.in_(args))
)
max_results = config.getint("options", "max_rpc_results")
packages = self.entities(packages).limit(max_results + 1)
if packages.count() > max_results:
raise RPCError("Too many package results.")
ids = {pkg.ID for pkg in packages}
self.subquery(ids)
return self._assemble_json_data(packages, self.get_info_json_data)
def _handle_search_type( def _handle_search_type(
self, by: str = defaults.RPC_SEARCH_BY, args: list[str] = [] self, by: str = defaults.RPC_SEARCH_BY, args: list[str] = []
@ -330,12 +332,12 @@ class RPC:
search.search_by(by, arg) search.search_by(by, arg)
max_results = config.getint("options", "max_rpc_results") max_results = config.getint("options", "max_rpc_results")
results = self._entities(search.results()).limit(max_results + 1).all() results = self.entities(search.results()).limit(max_results + 1).all()
if len(results) > max_results: if len(results) > max_results:
raise RPCError("Too many package results.") raise RPCError("Too many package results.")
return self._assemble_json_data(results, self._get_json_data) return self._assemble_json_data(results, self.get_json_data)
def _handle_msearch_type( def _handle_msearch_type(
self, args: list[str] = [], **kwargs self, args: list[str] = [], **kwargs

View file

@ -155,6 +155,12 @@ PackageBases = Table(
nullable=False, nullable=False,
server_default=text("0"), server_default=text("0"),
), ),
Column(
"PopularityUpdated",
TIMESTAMP,
nullable=False,
server_default=text("'1970-01-01 00:00:01.000000'"),
),
Column("OutOfDateTS", BIGINT(unsigned=True)), Column("OutOfDateTS", BIGINT(unsigned=True)),
Column("FlaggerComment", Text, nullable=False), Column("FlaggerComment", Text, nullable=False),
Column("SubmittedTS", BIGINT(unsigned=True), nullable=False), Column("SubmittedTS", BIGINT(unsigned=True), nullable=False),

View file

@ -0,0 +1,125 @@
import argparse
import importlib
import os
import sys
import traceback
from datetime import datetime
import orjson
import pygit2
from aurweb import config
# Constants

# Git reference passed to `create_commit` when committing archive updates.
REF = "refs/heads/master"

# orjson serialization options: sorted keys, two-space indentation.
# NOTE(review): not referenced by any function visible in this module --
# confirm whether it is still needed.
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
def init_repository(git_info) -> None:
    """
    Initialize a Git repository at `git_info.path` and apply its config.

    :param git_info: Object exposing `path` (repository location) and
                     `config` (mapping of Git config keys to values)
    """
    repo_path = git_info.path
    pygit2.init_repository(repo_path)

    # Re-open the freshly initialized repository to write its local config.
    repository = pygit2.Repository(repo_path)
    for key, value in git_info.config.items():
        repository.config[key] = value
def parse_args():
    """
    Parse command-line arguments.

    A single required `--spec` option names the spec module to run.

    :returns: The parsed argparse namespace
    """
    spec_option = {
        "type": str,
        "required": True,
        "help": "name of spec module in the aurweb.archives.spec package",
    }
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--spec", **spec_option)
    return arg_parser.parse_args()
def update_repository(repo: pygit2.Repository):
    """
    Commit all pending changes in `repo`, if there are any.

    When `repo.status()` reports no changed files, nothing is committed.
    Otherwise everything in the working tree is staged and one commit is
    created on `REF`, authored with the `[git-archive]` `author` and
    `author-email` configuration values.

    :param repo: Repository to inspect and commit to
    """
    # Imported locally because this module only imports `datetime` from
    # the datetime package at file level.
    from datetime import timezone

    # Use git status to determine file changes
    has_changes = any(
        flags != pygit2.GIT_STATUS_CURRENT for flags in repo.status().values()
    )
    if not has_changes:
        print("no diff detected")
        return

    print("diff detected, committing")

    # Add everything in the tree.
    print("adding files to git tree")

    # Add the tree to staging
    repo.index.read()
    repo.index.add_all()
    repo.index.write()
    tree = repo.index.write_tree()

    # Determine base commit; if repo.head.target raises GitError,
    # we have no current commits
    try:
        base = [repo.head.target]
    except pygit2.GitError:
        base = []

    # Use a timezone-aware UTC datetime. `.timestamp()` on a naive
    # `datetime.utcnow()` value is interpreted as local time, which skews
    # the commit time by the host's UTC offset.
    utcnow = datetime.now(timezone.utc)
    author = pygit2.Signature(
        config.get("git-archive", "author"),
        config.get("git-archive", "author-email"),
        int(utcnow.timestamp()),
        0,
    )

    # Commit the changes
    timestamp = utcnow.strftime("%Y-%m-%d %H:%M:%S")
    title = f"update - {timestamp}"
    repo.create_commit(REF, author, author, title, tree, base)

    print("committed changes")
def main() -> int:
    """
    Run the spec named by `--spec` and commit its outputs.

    The spec module is imported from the `aurweb.archives.spec` package.
    Each output produced by its `Spec.generate()` is written into its
    target repository, which is initialized first if it has no `.git`
    entry. Afterwards every repository recorded in `spec.repos` is
    committed via `update_repository`.

    :returns: 0 on completion
    """
    args = parse_args()

    print(f"loading '{args.spec}' spec")
    spec_package = "aurweb.archives.spec"
    module_path = f"{spec_package}.{args.spec}"
    spec_module = importlib.import_module(module_path)
    print(f"loaded '{args.spec}'")

    print(f"running '{args.spec}' spec...")
    spec = spec_module.Spec()
    for output in spec.generate():
        if not os.path.exists(output.git_info.path / ".git"):
            init_repository(output.git_info)

        path = output.git_info.path / output.filename
        with open(path, "wb") as f:
            f.write(output.data)
    print(f"done running '{args.spec}' spec")

    # After running the spec, make a single commit for each repository it
    # touched that contains changes.
    print("processing repositories")
    for path in spec.repos:
        print(f"processing repository: {path}")
        update_repository(pygit2.Repository(path))

    return 0
if __name__ == "__main__":
    # Exit status: main()'s return value on success, 0 when interrupted
    # from the keyboard, 1 (after printing the traceback) on any other
    # exception.
    exit_code = 1
    try:
        exit_code = main()
    except KeyboardInterrupt:
        exit_code = 0
    except Exception:
        traceback.print_exc()
    sys.exit(exit_code)

View file

@ -188,6 +188,7 @@ def _main():
USERS = aurweb.config.get("mkpkglists", "userfile") USERS = aurweb.config.get("mkpkglists", "userfile")
bench = Benchmark() bench = Benchmark()
logger.warning(f"{sys.argv[0]} is deprecated and will be soon be removed")
logger.info("Started re-creating archives, wait a while...") logger.info("Started re-creating archives, wait a while...")
query = ( query = (

View file

@ -1,9 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from datetime import datetime
from sqlalchemy import and_, func from sqlalchemy import and_, func
from sqlalchemy.sql.functions import coalesce, sum as _sum from sqlalchemy.sql.functions import coalesce, sum as _sum
from aurweb import db, time from aurweb import config, db, time
from aurweb.models import PackageBase, PackageVote from aurweb.models import PackageBase, PackageVote
@ -46,13 +47,24 @@ def run_variable(pkgbases: list[PackageBase] = []) -> None:
ids = set() ids = set()
if pkgbases: if pkgbases:
# If `pkgbases` were given, we should forcefully update the given
# package base records' popularities.
ids = {pkgbase.ID for pkgbase in pkgbases} ids = {pkgbase.ID for pkgbase in pkgbases}
query = query.filter(PackageBase.ID.in_(ids)) query = query.filter(PackageBase.ID.in_(ids))
else:
# Otherwise, we should only update popularities which have exceeded
# the popularity interval length.
interval = config.getint("git-archive", "popularity-interval")
query = query.filter(
PackageBase.PopularityUpdated
<= datetime.fromtimestamp((now - interval))
)
query.update( query.update(
{ {
"NumVotes": votes_subq.scalar_subquery(), "NumVotes": votes_subq.scalar_subquery(),
"Popularity": pop_subq.scalar_subquery(), "Popularity": pop_subq.scalar_subquery(),
"PopularityUpdated": datetime.fromtimestamp(now),
} }
) )

View file

@ -1,6 +1,4 @@
import os import os
import shlex
from subprocess import PIPE, Popen
from typing import Tuple from typing import Tuple
import py import py
@ -8,6 +6,7 @@ import py
from aurweb.models import Package from aurweb.models import Package
from aurweb.templates import base_template from aurweb.templates import base_template
from aurweb.testing.filelock import FileLock from aurweb.testing.filelock import FileLock
from aurweb.util import shell_exec
class GitRepository: class GitRepository:
@ -24,10 +23,7 @@ class GitRepository:
self.file_lock.lock(on_create=self._setup) self.file_lock.lock(on_create=self._setup)
def _exec(self, cmdline: str, cwd: str) -> Tuple[int, str, str]: def _exec(self, cmdline: str, cwd: str) -> Tuple[int, str, str]:
args = shlex.split(cmdline) return shell_exec(cmdline, cwd)
proc = Popen(args, cwd=cwd, stdout=PIPE, stderr=PIPE)
out, err = proc.communicate()
return (proc.returncode, out.decode().strip(), err.decode().strip())
def _exec_repository(self, cmdline: str) -> Tuple[int, str, str]: def _exec_repository(self, cmdline: str) -> Tuple[int, str, str]:
return self._exec(cmdline, cwd=str(self.file_lock.path)) return self._exec(cmdline, cwd=str(self.file_lock.path))

View file

@ -1,6 +1,7 @@
import math import math
import re import re
import secrets import secrets
import shlex
import string import string
from datetime import datetime from datetime import datetime
from http import HTTPStatus from http import HTTPStatus
@ -192,3 +193,10 @@ def parse_ssh_key(string: str) -> Tuple[str, str]:
def parse_ssh_keys(string: str) -> list[Tuple[str, str]]: def parse_ssh_keys(string: str) -> list[Tuple[str, str]]:
"""Parse a list of SSH public keys.""" """Parse a list of SSH public keys."""
return [parse_ssh_key(e) for e in string.splitlines()] return [parse_ssh_key(e) for e in string.splitlines()]
def shell_exec(cmdline: str, cwd: str) -> Tuple[int, str, str]:
    """
    Execute a command line in `cwd` and capture its result.

    The command line is tokenized with `shlex.split` and run directly
    (no shell is spawned).

    :param cmdline: Command line to execute
    :param cwd: Working directory for the child process
    :returns: Tuple of (return code, stdout, stderr); both streams are
              decoded and stripped of surrounding whitespace
    """
    argv = shlex.split(cmdline)
    process = Popen(argv, cwd=cwd, stdout=PIPE, stderr=PIPE)
    stdout, stderr = (
        stream.decode().strip() for stream in process.communicate()
    )
    return (process.returncode, stdout, stderr)

View file

@ -131,6 +131,18 @@ packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
userfile = /srv/http/aurweb/web/html/users.gz userfile = /srv/http/aurweb/web/html/users.gz
[git-archive]
author = git_archive.py
author-email = no-reply@archlinux.org
; One week worth of seconds (86400 * 7)
popularity-interval = 604800
metadata-repo = /srv/http/aurweb/metadata.git
users-repo = /srv/http/aurweb/users.git
pkgbases-repo = /srv/http/aurweb/pkgbases.git
pkgnames-repo = /srv/http/aurweb/pkgnames.git
[devel] [devel]
; commit_url is a format string used to produce a link to a commit hash. ; commit_url is a format string used to produce a link to a commit hash.
commit_url = https://gitlab.archlinux.org/archlinux/aurweb/-/commits/%s commit_url = https://gitlab.archlinux.org/archlinux/aurweb/-/commits/%s

View file

@ -76,5 +76,11 @@ packagesmetaextfile = /var/lib/aurweb/archives/packages-meta-ext-v1.json.gz
pkgbasefile = /var/lib/aurweb/archives/pkgbase.gz pkgbasefile = /var/lib/aurweb/archives/pkgbase.gz
userfile = /var/lib/aurweb/archives/users.gz userfile = /var/lib/aurweb/archives/users.gz
[git-archive]
metadata-repo = metadata.git
users-repo = users.git
pkgbases-repo = pkgbases.git
pkgnames-repo = pkgnames.git
[aurblup] [aurblup]
db-path = YOUR_AUR_ROOT/aurblup/ db-path = YOUR_AUR_ROOT/aurblup/

75
doc/git-archive.md Normal file
View file

@ -0,0 +1,75 @@
# aurweb Git Archive Specification
<span style="color: red">
WARNING: This aurweb Git Archive implementation is
experimental and may be changed.
</span>
## Overview
This git archive specification refers to the archive git repositories
created by [aurweb/scripts/git_archive.py](../aurweb/scripts/git_archive.py)
using [spec modules](#spec-modules).
## Configuration
- `[git-archive]`
- `author`
- Git commit author
- `author-email`
- Git commit author email
See an [official spec](#official-specs)'s documentation for spec-specific
configurations.
## Fetch/Update Archives
When a client has not yet fetched any initial archives, they should clone
the repository:
$ git clone https://aur.archlinux.org/archive.git aurweb-archive
When updating, the repository is already cloned and changes need to be pulled
from remote:
# To update:
$ cd aurweb-archive && git pull
For end-user production applications, see
[Minimize Disk Space](#minimize-disk-space).
## Minimize Disk Space
Running `git gc` on the repository compresses revisions and removes
unreachable objects, which otherwise grow the repository by a considerable
amount with each commit. It is recommended to run the following command
after cloning the archive or pulling updates:
$ cd aurweb-archive && git gc --aggressive
## Spec Modules
Each aurweb spec module belongs to the `aurweb.archives.spec` package. For
example: a spec named "example" would be located at
`aurweb.archives.spec.example`.
[Official spec listings](#official-specs) use the following format:
- `spec_name`
- Spec description; what this spec produces
- `<link to repo documentation>`
### Official Specs
- [metadata](specs/metadata.md)
- Package RPC `type=info` metadata
- [metadata-repo](repos/metadata-repo.md)
- [users](specs/users.md)
- List of users found in the database
- [users-repo](repos/users-repo.md)
- [pkgbases](specs/pkgbases.md)
- List of package bases found in the database
- [pkgbases-repo](repos/pkgbases-repo.md)
- [pkgnames](specs/pkgnames.md)
- List of package names found in the database
- [pkgnames-repo](repos/pkgnames-repo.md)

View file

@ -70,20 +70,48 @@ computations and clean up the database:
* aurweb-pkgmaint automatically removes empty repositories that were created * aurweb-pkgmaint automatically removes empty repositories that were created
within the last 24 hours but never populated. within the last 24 hours but never populated.
* aurweb-mkpkglists generates the package list files; it takes an optional * [Deprecated] aurweb-mkpkglists generates the package list files; it takes
--extended flag, which additionally produces multiinfo metadata. It also an optional --extended flag, which additionally produces multiinfo metadata.
generates {archive.gz}.sha256 files that should be located within It also generates {archive.gz}.sha256 files that should be located within
mkpkglists.archivedir which contain a SHA-256 hash of their matching mkpkglists.archivedir which contain a SHA-256 hash of their matching
.gz counterpart. .gz counterpart.
* aurweb-usermaint removes the last login IP address of all users that did not * aurweb-usermaint removes the last login IP address of all users that did not
login within the past seven days. login within the past seven days.
* aurweb-git-archive generates Git repository archives based on a --spec.
This script is a new generation of aurweb-mkpkglists, which creates and
maintains Git repository versions of the archives produced by
aurweb-mkpkglists. See doc/git-archive.md for detailed documentation.
These scripts can be installed by running `poetry install` and are These scripts can be installed by running `poetry install` and are
usually scheduled using Cron. The current setup is: usually scheduled using Cron. The current setup is:
---- ----
*/5 * * * * poetry run aurweb-mkpkglists [--extended] # Run aurweb-git-archive --spec metadata directly after
# aurweb-mkpkglists so that they are executed sequentially, since
# both scripts are quite heavy. `aurweb-mkpkglists` should be removed
# from here once its deprecation period has ended.
*/5 * * * * poetry run aurweb-mkpkglists [--extended] && poetry run aurweb-git-archive --spec metadata
# Update popularity once an hour. This is done to reduce the amount
# of changes caused by popularity data. Even if a package is otherwise
# unchanged, popularity is recalculated every 5 minutes via aurweb-popupdate,
# which causes changes for a large chunk of packages.
#
# At this interval, clients can still take advantage of popularity
# data, but its updates are guarded behind hour-long intervals.
*/60 * * * * poetry run aurweb-git-archive --spec popularity
# Usernames
*/5 * * * * poetry run aurweb-git-archive --spec users
# Package base names
*/5 * * * * poetry run aurweb-git-archive --spec pkgbases
# Package names
*/5 * * * * poetry run aurweb-git-archive --spec pkgnames
1 */2 * * * poetry run aurweb-popupdate 1 */2 * * * poetry run aurweb-popupdate
2 */2 * * * poetry run aurweb-aurblup 2 */2 * * * poetry run aurweb-aurblup
3 */2 * * * poetry run aurweb-pkgmaint 3 */2 * * * poetry run aurweb-pkgmaint

121
doc/repos/metadata-repo.md Normal file
View file

@ -0,0 +1,121 @@
# Repository: metadata-repo
## Overview
The resulting repository contains RPC `type=info` JSON data for packages,
split into two different files:
- `pkgbase.json` contains details about each package base in the AUR
- `pkgname.json` contains details about each package in the AUR
See [Data](#data) for a breakdown of how data is presented in this
repository based off of a RPC `type=info` base.
See [File Layout](#file-layout) for a detailed summary of the layout
of these files and the data contained within.
**NOTE: `Popularity` now requires a client-side calculation, see [Popularity Calculation](#popularity-calculation).**
## Data
This repository contains RPC `type=info` data for all packages found
in AUR's database, reorganized to be suitable for Git repository
changes.
- `pkgname.json` holds Package-specific metadata
- Some fields have been removed from `pkgname.json` objects
- `ID`
- `PackageBaseID -> ID` (moved to `pkgbase.json`)
- `NumVotes` (moved to `pkgbase.json`)
- `Popularity` (moved to `pkgbase.json`)
- `pkgbase.json` holds PackageBase-specific metadata
- Package Base fields from `pkgname.json` have been moved over to
`pkgbase.json`
- `ID`
- `Keywords`
- `FirstSubmitted`
- `LastModified`
- `OutOfDate`
- `Maintainer`
- `URLPath`
- `NumVotes`
- `Popularity`
- `PopularityUpdated`
## Popularity Calculation
Clients intending to use popularity data from this archive **must**
perform a decay calculation on their end to reflect a close approximation
of up-to-date popularity.
Putting this step onto the client allows the server to perform
fewer popularity record updates, dramatically improving archiving
of popularity data. The same calculation is done on the server side
when producing outputs for RPC `type=info` and package pages.
```
Let T = Current UTC timestamp in seconds
Let PU = PopularityUpdated timestamp in seconds
# The delta between now and PU in whole days (fractional part truncated)
Let D = trunc((T - PU) / 86400)
# Calculate up-to-date popularity:
P = Popularity * (0.98^D)
```
We can see that the resulting up-to-date popularity value decays as
the exponent is increased:
- `1.0 * (0.98^1) = 0.98`
- `1.0 * (0.98^2) = 0.9604`
- ...
This decay calculation essentially pushes back the date of each vote by
the exponent, which accounts for the time that has passed. However,
since the calculation is based on decimals and exponents, it
eventually becomes imprecise. To keep this imprecision from becoming an
issue for clients, the AUR updates these records on a forced interval and
whenever a vote is added to or removed from a particular package.
## File Layout
#### pkgbase.json:
{
"pkgbase1": {
"FirstSubmitted": 123456,
"ID": 1,
"LastModified": 123456,
"Maintainer": "kevr",
"OutOfDate": null,
"URLPath": "/cgit/aur.git/snapshot/pkgbase1.tar.gz",
"NumVotes": 1,
"Popularity": 1.0,
"PopularityUpdated": 12345567753.0
},
...
}
#### pkgname.json:
{
"pkg1": {
"CheckDepends": [], # Only included if a check dependency exists
"Conflicts": [], # Only included if a conflict exists
"Depends": [], # Only included if a dependency exists
"Description": "some description",
"Groups": [], # Only included if a group exists
"ID": 1,
"Keywords": [],
"License": [],
"MakeDepends": [], # Only included if a make dependency exists
"Name": "pkg1",
"OptDepends": [], # Only included if an opt dependency exists
"PackageBase": "pkgbase1",
"Provides": [], # Only included if `provides` is defined
"Replaces": [], # Only included if `replaces` is defined
"URL": "https://some_url.com",
"Version": "1.0-1"
},
...
}

View file

@ -0,0 +1,15 @@
# Repository: pkgbases-repo
## Overview
- `pkgbase.json` contains a list of package base names
## File Layout
### pkgbase.json:
[
"pkgbase1",
"pkgbase2",
...
]

View file

@ -0,0 +1,15 @@
# Repository: pkgnames-repo
## Overview
- `pkgname.json` contains a list of package names
## File Layout
### pkgname.json:
[
"pkgname1",
"pkgname2",
...
]

15
doc/repos/users-repo.md Normal file
View file

@ -0,0 +1,15 @@
# Repository: users-repo
## Overview
- `users.json` contains a list of usernames
## File Layout
### users.json:
[
"user1",
"user2",
...
]

14
doc/specs/metadata.md Normal file
View file

@ -0,0 +1,14 @@
# Git Archive Spec: metadata
## Configuration
- `[git-archive]`
- `metadata-repo`
- Path to package metadata git repository location
## Repositories
For documentation on each of these repositories, follow its link,
which leads to a markdown document dedicated to that repository.
- [metadata-repo](doc/repos/metadata-repo.md)

14
doc/specs/pkgbases.md Normal file
View file

@ -0,0 +1,14 @@
# Git Archive Spec: pkgbases
## Configuration
- `[git-archive]`
- `pkgbases-repo`
- Path to pkgbases git repository location
## Repositories
For documentation on each of these repositories, follow its link,
which leads to a markdown document dedicated to that repository.
- [pkgbases-repo](doc/repos/pkgbases-repo.md)

14
doc/specs/pkgnames.md Normal file
View file

@ -0,0 +1,14 @@
# Git Archive Spec: pkgnames
## Configuration
- `[git-archive]`
- `pkgnames-repo`
- Path to pkgnames git repository location
## Repositories
For documentation on each of these repositories, follow its link,
which leads to a markdown document dedicated to that repository.
- [pkgnames-repo](doc/repos/pkgnames-repo.md)

14
doc/specs/popularity.md Normal file
View file

@ -0,0 +1,14 @@
# Git Archive Spec: popularity
## Configuration
- `[git-archive]`
- `popularity-repo`
- Path to popularity git repository location
## Repositories
For documentation on each of these repositories, follow its link,
which leads to a markdown document dedicated to that repository.
- [popularity-repo](doc/repos/popularity-repo.md)

14
doc/specs/users.md Normal file
View file

@ -0,0 +1,14 @@
# Git Archive Spec: users
## Configuration
- `[git-archive]`
- `users-repo`
- Path to users git repository location
## Repositories
For documentation on each of these repositories, follow its link,
which leads to a markdown document dedicated to that repository.
- [users-repo](doc/repos/users-repo.md)

View file

@ -0,0 +1,33 @@
"""add PopularityUpdated to PackageBase
Revision ID: 6441d3b65270
Revises: d64e5571bc8d
Create Date: 2022-09-22 18:08:03.280664
"""
from alembic import op
from sqlalchemy.exc import OperationalError
from aurweb.models.package_base import PackageBase
from aurweb.scripts import popupdate
# revision identifiers, used by Alembic.
revision = "6441d3b65270"
down_revision = "d64e5571bc8d"
branch_labels = None
depends_on = None
table = PackageBase.__table__
def upgrade():
    """
    Add the `PopularityUpdated` column to the PackageBase table.

    If adding the column raises an OperationalError (treated here as
    "the column is already present"), the schema change is skipped and a
    notice is printed. `popupdate.run_variable()` is invoked afterwards
    in either case.
    """
    try:
        op.add_column(table.name, table.c.PopularityUpdated)
    except OperationalError:
        # The operation that failed adds a column, not a table, so the
        # notice names the column instead of claiming the table exists.
        print(
            f"column 'PopularityUpdated' already exists in table "
            f"'{table.name}', skipping column creation"
        )

    # NOTE(review): presumably this recomputes popularity so the new column
    # gets populated -- confirm against aurweb.scripts.popupdate.
    popupdate.run_variable()
def downgrade():
    """Revert the migration by dropping `PopularityUpdated` from the table."""
    column_name = "PopularityUpdated"
    op.drop_column(table.name, column_name)

View file

@ -117,3 +117,4 @@ aurweb-tuvotereminder = "aurweb.scripts.tuvotereminder:main"
aurweb-usermaint = "aurweb.scripts.usermaint:main" aurweb-usermaint = "aurweb.scripts.usermaint:main"
aurweb-config = "aurweb.scripts.config:main" aurweb-config = "aurweb.scripts.config:main"
aurweb-adduser = "aurweb.scripts.adduser:main" aurweb-adduser = "aurweb.scripts.adduser:main"
aurweb-git-archive = "aurweb.scripts.git_archive:main"

View file

@ -149,7 +149,7 @@
</tr> </tr>
<tr> <tr>
<th>{{ "Popularity" | tr }}:</th> <th>{{ "Popularity" | tr }}:</th>
<td>{{ pkgbase.Popularity | number_format(6 if pkgbase.Popularity <= 0.2 else 2) }}</td> <td>{{ popularity | number_format(6 if popularity <= 0.2 else 2) }}</td>
</tr> </tr>
<tr> <tr>
<th>{{ "First Submitted" | tr }}:</th> <th>{{ "First Submitted" | tr }}:</th>

241
test/test_git_archives.py Normal file
View file

@ -0,0 +1,241 @@
from http import HTTPStatus
from typing import Tuple
from unittest import mock
import py
import pygit2
import pytest
from fastapi.testclient import TestClient
from aurweb import asgi, config, db
from aurweb.archives.spec.base import GitInfo, SpecBase
from aurweb.models import Package, PackageBase, User
from aurweb.scripts import git_archive
from aurweb.testing.requests import Request
@pytest.fixture
def mock_metadata_archive(
    tmp_path: py.path.local,
) -> py.path.local:
    """
    Redirect `[git-archive] metadata-repo` to a path under `tmp_path`.

    `aurweb.config.get` is patched for the lifetime of the fixture. Lookups
    of any other section/option pair are forwarded to the real getter.
    The fixture only computes the path; it does not create a repository.

    :param tmp_path: pytest-provided temporary directory
    :return: Path that the patched configuration reports for the
             metadata repository (a single path, like the sibling fixtures)
    """
    metadata_path = tmp_path / "metadata.git"

    # Capture the real getter before patching; the fallback below would
    # otherwise call the patched function and recurse.
    get_ = config.get

    def mock_config(section: str, option: str) -> str:
        if section == "git-archive":
            if option == "metadata-repo":
                return str(metadata_path)
        return get_(section, option)

    with mock.patch("aurweb.config.get", side_effect=mock_config):
        yield metadata_path
@pytest.fixture
def mock_users_archive(tmp_path: py.path.local) -> py.path.local:
    """
    Redirect `[git-archive] users-repo` to a path under `tmp_path`.

    All other configuration lookups are forwarded to the real
    `config.get`. Yields the overridden repository path.
    """
    repo_path = tmp_path / "users.git"

    # Keep a handle on the unpatched getter for the fallback path.
    real_get = config.get

    def fake_get(section: str, option: str) -> str:
        overridden = section == "git-archive" and option == "users-repo"
        return str(repo_path) if overridden else real_get(section, option)

    with mock.patch("aurweb.config.get", side_effect=fake_get):
        yield repo_path
@pytest.fixture
def mock_pkgbases_archive(tmp_path: py.path.local) -> py.path.local:
    """
    Redirect `[git-archive] pkgbases-repo` to a path under `tmp_path`.

    All other configuration lookups are forwarded to the real
    `config.get`. Yields the overridden repository path.
    """
    repo_path = tmp_path / "pkgbases.git"

    # Keep a handle on the unpatched getter for the fallback path.
    real_get = config.get

    def fake_get(section: str, option: str) -> str:
        overridden = section == "git-archive" and option == "pkgbases-repo"
        return str(repo_path) if overridden else real_get(section, option)

    with mock.patch("aurweb.config.get", side_effect=fake_get):
        yield repo_path
@pytest.fixture
def mock_pkgnames_archive(tmp_path: py.path.local) -> py.path.local:
    """
    Redirect `[git-archive] pkgnames-repo` to a path under `tmp_path`.

    All other configuration lookups are forwarded to the real
    `config.get`. Yields the overridden repository path.
    """
    repo_path = tmp_path / "pkgnames.git"

    # Keep a handle on the unpatched getter for the fallback path.
    real_get = config.get

    def fake_get(section: str, option: str) -> str:
        overridden = section == "git-archive" and option == "pkgnames-repo"
        return str(repo_path) if overridden else real_get(section, option)

    with mock.patch("aurweb.config.get", side_effect=fake_get):
        yield repo_path
@pytest.fixture
def metadata(mock_metadata_archive: py.path.local) -> py.path.local:
    """Patch `sys.argv` to select `--spec metadata`; yield the repo path."""
    with mock.patch("sys.argv", [__name__, "--spec", "metadata"]):
        yield mock_metadata_archive
@pytest.fixture
def users(mock_users_archive: py.path.local) -> py.path.local:
    """Patch `sys.argv` to select `--spec users`; yield the repo path."""
    with mock.patch("sys.argv", [__name__, "--spec", "users"]):
        yield mock_users_archive
@pytest.fixture
def pkgbases(mock_pkgbases_archive: py.path.local) -> py.path.local:
    """Patch `sys.argv` to select `--spec pkgbases`; yield the repo path."""
    with mock.patch("sys.argv", [__name__, "--spec", "pkgbases"]):
        yield mock_pkgbases_archive
@pytest.fixture
def pkgnames(mock_pkgnames_archive: py.path.local) -> py.path.local:
    """Patch `sys.argv` to select `--spec pkgnames`; yield the repo path."""
    with mock.patch("sys.argv", [__name__, "--spec", "pkgnames"]):
        yield mock_pkgnames_archive
@pytest.fixture
def client() -> TestClient:
    """Yield a FastAPI `TestClient` bound to the aurweb ASGI application."""
    test_client = TestClient(app=asgi.app)
    yield test_client
@pytest.fixture
def user(db_test: None) -> User:
    """Create a `User` record named "test" and yield it."""
    account_fields = {
        "Username": "test",
        "Email": "test@example.org",
        "Passwd": "testPassword",
    }
    with db.begin():
        created = db.create(User, **account_fields)
    yield created
@pytest.fixture
def package(user: User) -> Package:
    """
    Create a `PackageBase` and a `Package`, both named "test".

    `user` is set as both maintainer and packager of the package base.
    Yields the created `Package`.
    """
    with db.begin():
        base_record = db.create(
            PackageBase, Name="test", Maintainer=user, Packager=user
        )
        package_record = db.create(Package, PackageBase=base_record, Name="test")
    yield package_record
def commit_count(repo: pygit2.Repository) -> int:
    """
    Count the commits yielded by walking `repo` from its HEAD target.

    :param repo: Repository whose history is walked
    :return: Number of commits produced by `repo.walk(repo.head.target)`
    """
    return sum(1 for _ in repo.walk(repo.head.target))
def test_specbase_raises_notimplementederror():
    """`SpecBase.generate` must raise NotImplementedError when not overridden."""
    base_spec = SpecBase()
    pytest.raises(NotImplementedError, base_spec.generate)
def test_gitinfo_config(tmpdir: py.path.local):
    """Config passed through `GitInfo` must be readable from the new repo."""
    repo_path = tmpdir / "test.git"
    info = GitInfo(repo_path, {"user.name": "Test Person"})
    git_archive.init_repository(info)

    # Re-open the repository from disk and read the value back.
    configured_name = pygit2.Repository(repo_path).config["user.name"]
    assert configured_name == "Test Person"
def test_metadata(metadata: py.path.local, package: Package):
    """Two consecutive runs over unchanged data must yield a single commit."""
    # First run: creates the archive and commits the current package data
    # ("diff detected, committing" path).
    first_status = git_archive.main()
    assert first_status == 0
    assert commit_count(pygit2.Repository(metadata)) == 1

    # Second run: nothing changed, so the "no diff detected" path is taken
    # and the commit count stays the same.
    second_status = git_archive.main()
    assert second_status == 0
    assert commit_count(pygit2.Repository(metadata)) == 1
def test_metadata_change(
    client: TestClient, metadata: py.path.local, user: User, package: Package
):
    """Test that metadata changes via aurweb cause git_archive to produce diffs."""
    # Run main(), which creates mock_metadata_archive and commits current
    # package data to it, exercising the "diff detected, committing" path
    assert git_archive.main() == 0
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 1

    # Now, we modify `package`-related metadata via aurweb POST.
    pkgbasename = package.PackageBase.Name
    # Log the fixture user in and send the resulting session id as the
    # AURSID cookie with the request.
    cookies = {"AURSID": user.login(Request(), "testPassword")}
    with client as request:
        endp = f"/pkgbase/{pkgbasename}/keywords"
        post_data = {"keywords": "abc def"}
        resp = request.post(endp, data=post_data, cookies=cookies, allow_redirects=True)
    # Redirects are followed, so the final response is expected to be OK.
    assert resp.status_code == HTTPStatus.OK

    # Run main() again, which should now produce a new commit with the
    # keyword changes we just made
    assert git_archive.main() == 0
    # Re-open the repository so the commit count is read fresh from disk.
    repo = pygit2.Repository(metadata)
    assert commit_count(repo) == 2
def test_metadata_delete(client: TestClient, metadata: py.path.local, package: Package):
    """Deleting a package between runs must produce a second commit."""
    # Initial run: creates the archive and commits the current package data
    # ("diff detected, committing" path).
    initial_status = git_archive.main()
    assert initial_status == 0
    assert commit_count(pygit2.Repository(metadata)) == 1

    with db.begin():
        db.delete(package)

    # The deletion changes the generated output, so this run should commit.
    followup_status = git_archive.main()
    assert followup_status == 0
    assert commit_count(pygit2.Repository(metadata)) == 2
def test_users(users: py.path.local, user: User):
    """Running the `users` spec must succeed and leave exactly one commit."""
    status = git_archive.main()
    assert status == 0
    assert commit_count(pygit2.Repository(users)) == 1
def test_pkgbases(pkgbases: py.path.local, package: Package):
    """Running the `pkgbases` spec must succeed and leave exactly one commit."""
    status = git_archive.main()
    assert status == 0
    assert commit_count(pygit2.Repository(pkgbases)) == 1
def test_pkgnames(pkgnames: py.path.local, package: Package):
    """Running the `pkgnames` spec must succeed and leave exactly one commit."""
    status = git_archive.main()
    assert status == 0
    assert commit_count(pygit2.Repository(pkgnames)) == 1

View file

@ -9,6 +9,7 @@ from aurweb.filters import as_timezone, number_format, timestamp_to_datetime as
from aurweb.models import Package, PackageBase, User from aurweb.models import Package, PackageBase, User
from aurweb.models.account_type import USER_ID from aurweb.models.account_type import USER_ID
from aurweb.models.license import License from aurweb.models.license import License
from aurweb.models.package_base import popularity
from aurweb.models.package_license import PackageLicense from aurweb.models.package_license import PackageLicense
from aurweb.models.package_relation import PackageRelation from aurweb.models.package_relation import PackageRelation
from aurweb.models.relation_type import PROVIDES_ID, REPLACES_ID from aurweb.models.relation_type import PROVIDES_ID, REPLACES_ID
@ -287,12 +288,14 @@ def test_package_details(user: User, package: Package):
"""Test package details with most fields populated, but not all.""" """Test package details with most fields populated, but not all."""
request = Request(user=user, authenticated=True) request = Request(user=user, authenticated=True)
context = make_context(request, "Test Details") context = make_context(request, "Test Details")
context.update( context.update(
{ {
"request": request, "request": request,
"git_clone_uri_anon": GIT_CLONE_URI_ANON, "git_clone_uri_anon": GIT_CLONE_URI_ANON,
"git_clone_uri_priv": GIT_CLONE_URI_PRIV, "git_clone_uri_priv": GIT_CLONE_URI_PRIV,
"pkgbase": package.PackageBase, "pkgbase": package.PackageBase,
"popularity": popularity(package.PackageBase, time.utcnow()),
"package": package, "package": package,
"comaintainers": [], "comaintainers": [],
} }
@ -329,6 +332,7 @@ def test_package_details_filled(user: User, package: Package):
"git_clone_uri_anon": GIT_CLONE_URI_ANON, "git_clone_uri_anon": GIT_CLONE_URI_ANON,
"git_clone_uri_priv": GIT_CLONE_URI_PRIV, "git_clone_uri_priv": GIT_CLONE_URI_PRIV,
"pkgbase": package.PackageBase, "pkgbase": package.PackageBase,
"popularity": popularity(package.PackageBase, time.utcnow()),
"package": package, "package": package,
"comaintainers": [], "comaintainers": [],
"licenses": package.package_licenses, "licenses": package.package_licenses,