From 29c2d0de6b83a2287ffdb885d9b55aa63b1d4792 Mon Sep 17 00:00:00 2001 From: Kevin Morris Date: Sun, 21 Nov 2021 00:47:48 -0800 Subject: [PATCH] change(mkpkglists): converted to use aurweb.db ORM - Improved speed dramatically - Removed mkpkglists sharness Signed-off-by: Kevin Morris --- aurweb/benchmark.py | 21 +++ aurweb/scripts/mkpkglists.py | 282 +++++++++++++++++++++-------------- test/t2100-mkpkglists.t | 65 -------- test/test_mkpkglists.py | 215 ++++++++++++++++++++++++++ 4 files changed, 403 insertions(+), 180 deletions(-) create mode 100644 aurweb/benchmark.py delete mode 100755 test/t2100-mkpkglists.t create mode 100644 test/test_mkpkglists.py diff --git a/aurweb/benchmark.py b/aurweb/benchmark.py new file mode 100644 index 00000000..7086fb08 --- /dev/null +++ b/aurweb/benchmark.py @@ -0,0 +1,21 @@ +from datetime import datetime + + +class Benchmark: + def __init__(self): + self.start() + + def _timestamp(self) -> float: + """ Generate a timestamp. """ + return float(datetime.utcnow().timestamp()) + + def start(self) -> int: + """ Start a benchmark. """ + self.current = self._timestamp() + return self.current + + def end(self): + """ Return the diff between now - start(). 
""" + n = self._timestamp() - self.current + self.current = float(0) + return n diff --git a/aurweb/scripts/mkpkglists.py b/aurweb/scripts/mkpkglists.py index 307b2b12..92de7931 100755 --- a/aurweb/scripts/mkpkglists.py +++ b/aurweb/scripts/mkpkglists.py @@ -23,23 +23,28 @@ import os import sys from collections import defaultdict -from decimal import Decimal +from typing import Any, Dict import orjson +from sqlalchemy import literal, orm + import aurweb.config -import aurweb.db + +from aurweb import db, logging, models, util +from aurweb.benchmark import Benchmark +from aurweb.models import Package, PackageBase, User + +logger = logging.get_logger("aurweb.scripts.mkpkglists") archivedir = aurweb.config.get("mkpkglists", "archivedir") os.makedirs(archivedir, exist_ok=True) -packagesfile = aurweb.config.get('mkpkglists', 'packagesfile') -packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile') -packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile') - -pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile') - -userfile = aurweb.config.get('mkpkglists', 'userfile') +PACKAGES = aurweb.config.get('mkpkglists', 'packagesfile') +META = aurweb.config.get('mkpkglists', 'packagesmetafile') +META_EXT = aurweb.config.get('mkpkglists', 'packagesmetaextfile') +PKGBASE = aurweb.config.get('mkpkglists', 'pkgbasefile') +USERS = aurweb.config.get('mkpkglists', 'userfile') TYPE_MAP = { @@ -53,7 +58,7 @@ TYPE_MAP = { } -def get_extended_dict(query: str): +def get_extended_dict(query: orm.Query): """ Produce data in the form in a single bulk SQL query: @@ -74,61 +79,75 @@ def get_extended_dict(query: str): output[i].update(data.get(package_id)) """ - conn = aurweb.db.Connection() - - cursor = conn.execute(query) - data = defaultdict(lambda: defaultdict(list)) - for result in cursor.fetchall(): - + for result in query: pkgid = result[0] key = TYPE_MAP.get(result[1], result[1]) output = result[2] if result[3]: output += result[3] - - # In all 
cases, we have at least an empty License list. - if "License" not in data[pkgid]: - data[pkgid]["License"] = [] - - # In all cases, we have at least an empty Keywords list. - if "Keywords" not in data[pkgid]: - data[pkgid]["Keywords"] = [] - data[pkgid][key].append(output) - conn.close() return data def get_extended_fields(): - # Returns: [ID, Type, Name, Cond] - query = """ - SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type, - PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond - FROM PackageDepends - LEFT JOIN DependencyTypes - ON DependencyTypes.ID = PackageDepends.DepTypeID - UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type, - PackageRelations.RelName AS Name, - PackageRelations.RelCondition AS Cond - FROM PackageRelations - LEFT JOIN RelationTypes - ON RelationTypes.ID = PackageRelations.RelTypeID - UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type, - Groups.Name, '' AS Cond - FROM Groups - INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID - UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type, - Licenses.Name, '' as Cond - FROM Licenses - INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID - UNION SELECT Packages.ID AS ID, 'Keywords' AS Type, - PackageKeywords.Keyword AS Name, '' as Cond - FROM PackageKeywords - INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID - """ + subqueries = [ + # PackageDependency + db.query( + models.PackageDependency + ).join(models.DependencyType).with_entities( + models.PackageDependency.PackageID.label("ID"), + models.DependencyType.Name.label("Type"), + models.PackageDependency.DepName.label("Name"), + models.PackageDependency.DepCondition.label("Cond") + ).distinct().order_by("Name"), + + # PackageRelation + db.query( + models.PackageRelation + ).join(models.RelationType).with_entities( + models.PackageRelation.PackageID.label("ID"), + models.RelationType.Name.label("Type"), + 
models.PackageRelation.RelName.label("Name"), + models.PackageRelation.RelCondition.label("Cond") + ).distinct().order_by("Name"), + + # Groups + db.query(models.PackageGroup).join( + models.Group, + models.PackageGroup.GroupID == models.Group.ID + ).with_entities( + models.PackageGroup.PackageID.label("ID"), + literal("Groups").label("Type"), + models.Group.Name.label("Name"), + literal(str()).label("Cond") + ).distinct().order_by("Name"), + + # Licenses + db.query(models.PackageLicense).join( + models.License, + models.PackageLicense.LicenseID == models.License.ID + ).with_entities( + models.PackageLicense.PackageID.label("ID"), + literal("License").label("Type"), + models.License.Name.label("Name"), + literal(str()).label("Cond") + ).distinct().order_by("Name"), + + # Keywords + db.query(models.PackageKeyword).join( + models.Package, + Package.PackageBaseID == models.PackageKeyword.PackageBaseID + ).with_entities( + models.Package.ID.label("ID"), + literal("Keywords").label("Type"), + models.PackageKeyword.Keyword.label("Name"), + literal(str()).label("Cond") + ).distinct().order_by("Name") + ] + query = subqueries[0].union_all(*subqueries[1:]) return get_extended_dict(query) @@ -137,89 +156,122 @@ EXTENDED_FIELD_HANDLERS = { } -def is_decimal(column): - """ Check if an SQL column is of decimal.Decimal type. 
""" - if isinstance(column, Decimal): - return float(column) - return column +def as_dict(package: Package) -> Dict[str, Any]: + return { + "ID": package.ID, + "Name": package.Name, + "PackageBaseID": package.PackageBaseID, + "PackageBase": package.PackageBase, + "Version": package.Version, + "Description": package.Description, + "NumVotes": package.NumVotes, + "Popularity": float(package.Popularity), + "OutOfDate": package.OutOfDate, + "Maintainer": package.Maintainer, + "FirstSubmitted": package.FirstSubmitted, + "LastModified": package.LastModified, + } -def write_archive(archive: str, output: list): - with gzip.open(archive, "wb") as f: - f.write(b"[\n") - for i, item in enumerate(output): - f.write(orjson.dumps(item)) - if i < len(output) - 1: - f.write(b",") - f.write(b"\n") - f.write(b"]") +def _main(): + bench = Benchmark() + logger.info("Started re-creating archives, wait a while...") - -def main(): - conn = aurweb.db.Connection() - - # Query columns; copied from RPC. - columns = ("Packages.ID, Packages.Name, " - "PackageBases.ID AS PackageBaseID, " - "PackageBases.Name AS PackageBase, " - "Version, Description, URL, NumVotes, " - "Popularity, OutOfDateTS AS OutOfDate, " - "Users.UserName AS Maintainer, " - "SubmittedTS AS FirstSubmitted, " - "ModifiedTS AS LastModified") - - # Perform query. 
- cur = conn.execute(f"SELECT {columns} FROM Packages " - "LEFT JOIN PackageBases " - "ON PackageBases.ID = Packages.PackageBaseID " - "LEFT JOIN Users " - "ON PackageBases.MaintainerUID = Users.ID " - "WHERE PackageBases.PackagerUID IS NOT NULL") + query = db.query(Package).join( + PackageBase, + PackageBase.ID == Package.PackageBaseID + ).join( + User, + PackageBase.MaintainerUID == User.ID, + isouter=True + ).filter(PackageBase.PackagerUID.isnot(None)).with_entities( + Package.ID, + Package.Name, + PackageBase.ID.label("PackageBaseID"), + PackageBase.Name.label("PackageBase"), + Package.Version, + Package.Description, + PackageBase.NumVotes, + PackageBase.Popularity, + PackageBase.OutOfDateTS.label("OutOfDate"), + User.Username.label("Maintainer"), + PackageBase.SubmittedTS.label("FirstSubmitted"), + PackageBase.ModifiedTS.label("LastModified") + ).distinct().order_by("Name") # Produce packages-meta-v1.json.gz output = list() snapshot_uri = aurweb.config.get("options", "snapshot_uri") - for result in cur.fetchall(): - item = { - column[0]: is_decimal(result[i]) - for i, column in enumerate(cur.description) - } - item["URLPath"] = snapshot_uri % item.get("Name") - output.append(item) + gzips = { + "packages": gzip.open(PACKAGES, "wt"), + "meta": gzip.open(META, "wb"), + } - write_archive(packagesmetafile, output) + # Append list opening to the metafile. + gzips["meta"].write(b"[\n") - # Produce packages-meta-ext-v1.json.gz + # Produce packages.gz + packages-meta-ext-v1.json.gz + extended = False if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS: + gzips["meta_ext"] = gzip.open(META_EXT, "wb") + # Append list opening to the meta_ext file. 
+ gzips.get("meta_ext").write(b"[\n") f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1]) data = f() + extended = True - default_ = {"Groups": [], "License": [], "Keywords": []} - for i in range(len(output)): - data_ = data.get(output[i].get("ID"), default_) - output[i].update(data_) + results = query.all() + n = len(results) - 1 + for i, result in enumerate(results): + # Append to packages.gz. + gzips.get("packages").write(f"{result.Name}\n") - write_archive(packagesmetaextfile, output) + # Construct our result JSON dictionary. + item = as_dict(result) + item["URLPath"] = snapshot_uri % result.Name - # Produce packages.gz - with gzip.open(packagesfile, "wb") as f: - f.writelines([ - bytes(x.get("Name") + "\n", "UTF-8") - for x in output - ]) + # We stream out package json objects line per line, so + # we also need to include the ',' character at the end + # of package lines (excluding the last package). + suffix = b",\n" if i < n else b'\n' + + # Write out to packagesmetafile + output.append(item) + gzips.get("meta").write(orjson.dumps(output[-1]) + suffix) + + if extended: + # Write out to packagesmetaextfile. + data_ = data.get(result.ID, {}) + output[-1].update(data_) + gzips.get("meta_ext").write(orjson.dumps(output[-1]) + suffix) + + # Append the list closing to meta/meta_ext. + gzips.get("meta").write(b"]") + if extended: + gzips.get("meta_ext").write(b"]") + + # Close gzip files. 
+ util.apply_all(gzips.values(), lambda gz: gz.close()) # Produce pkgbase.gz - with gzip.open(pkgbasefile, "w") as f: - cur = conn.execute("SELECT Name FROM PackageBases " + - "WHERE PackagerUID IS NOT NULL") - f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) + query = db.query(PackageBase.Name).filter( + PackageBase.PackagerUID.isnot(None)).all() + with gzip.open(PKGBASE, "wt") as f: + f.writelines([f"{base.Name}\n" for i, base in enumerate(query)]) # Produce users.gz - with gzip.open(userfile, "w") as f: - cur = conn.execute("SELECT UserName FROM Users") - f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) + query = db.query(User.Username).all() + with gzip.open(USERS, "wt") as f: + f.writelines([f"{user.Username}\n" for i, user in enumerate(query)]) - conn.close() + seconds = util.number_format(bench.end(), 4) + logger.info(f"Completed in {seconds} seconds.") + + +def main(): + db.get_engine() + with db.begin(): + _main() if __name__ == '__main__': diff --git a/test/t2100-mkpkglists.t b/test/t2100-mkpkglists.t deleted file mode 100755 index d217c4f6..00000000 --- a/test/t2100-mkpkglists.t +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/sh - -test_description='mkpkglists tests' - -. "$(dirname "$0")/setup.sh" - -test_expect_success 'Test package list generation with no packages.' ' - echo "DELETE FROM Packages;" | sqlite3 aur.db && - echo "DELETE FROM PackageBases;" | sqlite3 aur.db && - cover "$MKPKGLISTS" && - test $(zcat packages.gz | wc -l) -eq 0 && - test $(zcat pkgbase.gz | wc -l) -eq 0 -' - -test_expect_success 'Test package list generation.' 
' - cat <<-EOD | sqlite3 aur.db && - INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (1, "foobar", 1, 0, 0, ""); - INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (2, "foobar2", 2, 0, 0, ""); - INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (3, "foobar3", NULL, 0, 0, ""); - INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (4, "foobar4", 1, 0, 0, ""); - INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (1, 1, "pkg1"); - INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (2, 1, "pkg2"); - INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (3, 1, "pkg3"); - INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (4, 2, "pkg4"); - INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (5, 3, "pkg5"); - EOD - cover "$MKPKGLISTS" && - cat <<-EOD >expected && - foobar - foobar2 - foobar4 - EOD - gunzip pkgbase.gz && - sed "/^#/d" pkgbase >actual && - test_cmp actual expected && - cat <<-EOD >expected && - pkg1 - pkg2 - pkg3 - pkg4 - EOD - gunzip packages.gz && - sed "/^#/d" packages >actual && - test_cmp actual expected -' - -test_expect_success 'Test user list generation.' 
' - cover "$MKPKGLISTS" && - cat <<-EOD >expected && - dev - tu - tu2 - tu3 - tu4 - user - user2 - user3 - user4 - EOD - gunzip users.gz && - sed "/^#/d" users >actual && - test_cmp actual expected -' - -test_done diff --git a/test/test_mkpkglists.py b/test/test_mkpkglists.py new file mode 100644 index 00000000..ee66e4e1 --- /dev/null +++ b/test/test_mkpkglists.py @@ -0,0 +1,215 @@ +import json + +from typing import List, Union +from unittest import mock + +import pytest + +from aurweb import config, db, util +from aurweb.models import License, Package, PackageBase, PackageDependency, PackageLicense, User +from aurweb.models.account_type import USER_ID +from aurweb.models.dependency_type import DEPENDS_ID +from aurweb.testing import noop + + +class FakeFile: + data = str() + __exit__ = noop + + def __init__(self, modes: str) -> "FakeFile": + self.modes = modes + + def __enter__(self, *args, **kwargs) -> "FakeFile": + return self + + def write(self, data: Union[str, bytes]) -> None: + if isinstance(data, bytes): + data = data.decode() + self.data += data + + def writelines(self, dataset: List[Union[str, bytes]]) -> None: + util.apply_all(dataset, self.write) + + def close(self) -> None: + return + + +class MockGzipOpen: + def __init__(self): + self.gzips = dict() + + def open(self, archive: str, modes: str): + self.gzips[archive] = FakeFile(modes) + return self.gzips.get(archive) + + def get(self, key: str) -> FakeFile: + return self.gzips.get(key) + + def __getitem__(self, key: str) -> FakeFile: + return self.get(key) + + def __contains__(self, key: str) -> bool: + return key in self.gzips + + def data(self, archive: str): + return self.get(archive).data + + +@pytest.fixture(autouse=True) +def setup(db_test): + config.rehash() + + +@pytest.fixture +def user() -> User: + with db.begin(): + user = db.create(User, Username="test", + Email="test@example.org", + Passwd="testPassword", + AccountTypeID=USER_ID) + yield user + + +@pytest.fixture +def packages(user: User) 
-> List[Package]: + output = [] + with db.begin(): + lic = db.create(License, Name="GPL") + for i in range(5): + # Create the package. + pkgbase = db.create(PackageBase, Name=f"pkgbase_{i}", + Packager=user) + pkg = db.create(Package, PackageBase=pkgbase, + Name=f"pkg_{i}") + + # Create some related records. + db.create(PackageLicense, Package=pkg, License=lic) + db.create(PackageDependency, DepTypeID=DEPENDS_ID, + Package=pkg, DepName=f"dep_{i}", + DepCondition=">=1.0") + + # Add the package to our output list. + output.append(pkg) + + # Sort output by the package name and return it. + yield sorted(output, key=lambda k: k.Name) + + +@mock.patch("os.makedirs", side_effect=noop) +def test_mkpkglists_empty(makedirs: mock.MagicMock): + gzips = MockGzipOpen() + with mock.patch("gzip.open", side_effect=gzips.open): + from aurweb.scripts import mkpkglists + mkpkglists.main() + + archives = config.get_section("mkpkglists") + archives.pop("archivedir") + archives.pop("packagesmetaextfile") + + for archive in archives.values(): + assert archive in gzips + + # Expect that packagesfile got created, but is empty because + # we have no DB records. + packages_file = archives.get("packagesfile") + assert gzips.data(packages_file) == str() + + # Expect that pkgbasefile got created, but is empty because + # we have no DB records. + users_file = archives.get("pkgbasefile") + assert gzips.data(users_file) == str() + + # Expect that userfile got created, but is empty because + # we have no DB records. + users_file = archives.get("userfile") + assert gzips.data(users_file) == str() + + # Expect that packagesmetafile got created, but is empty because + # we have no DB records; it's still a valid empty JSON list. 
+ meta_file = archives.get("packagesmetafile") + assert gzips.data(meta_file) == "[\n]" + + +@mock.patch("sys.argv", ["mkpkglists", "--extended"]) +@mock.patch("os.makedirs", side_effect=noop) +def test_mkpkglists_extended_empty(makedirs: mock.MagicMock): + gzips = MockGzipOpen() + with mock.patch("gzip.open", side_effect=gzips.open): + from aurweb.scripts import mkpkglists + mkpkglists.main() + + archives = config.get_section("mkpkglists") + archives.pop("archivedir") + + for archive in archives.values(): + assert archive in gzips + + # Expect that packagesfile got created, but is empty because + # we have no DB records. + packages_file = archives.get("packagesfile") + assert gzips.data(packages_file) == str() + + # Expect that pkgbasefile got created, but is empty because + # we have no DB records. + users_file = archives.get("pkgbasefile") + assert gzips.data(users_file) == str() + + # Expect that userfile got created, but is empty because + # we have no DB records. + users_file = archives.get("userfile") + assert gzips.data(users_file) == str() + + # Expect that packagesmetafile got created, but is empty because + # we have no DB records; it's still a valid empty JSON list. + meta_file = archives.get("packagesmetafile") + assert gzips.data(meta_file) == "[\n]" + + # Expect that packagesmetaextfile got created, but is empty because + # we have no DB records; it's still a valid empty JSON list. 
+ meta_file = archives.get("packagesmetaextfile") + assert gzips.data(meta_file) == "[\n]" + + +@mock.patch("sys.argv", ["mkpkglists", "--extended"]) +@mock.patch("os.makedirs", side_effect=noop) +def test_mkpkglists_extended(makedirs: mock.MagicMock, user: User, + packages: List[Package]): + gzips = MockGzipOpen() + with mock.patch("gzip.open", side_effect=gzips.open): + from aurweb.scripts import mkpkglists + mkpkglists.main() + + archives = config.get_section("mkpkglists") + archives.pop("archivedir") + + for archive in archives.values(): + assert archive in gzips + + # Expect that packagesfile got created and lists the name of + # every package fixture, one per line. + packages_file = archives.get("packagesfile") + expected = "\n".join([p.Name for p in packages]) + "\n" + assert gzips.data(packages_file) == expected + + # Expect that pkgbasefile got created and lists the name of + # every package base fixture, one per line. + users_file = archives.get("pkgbasefile") + expected = "\n".join([p.PackageBase.Name for p in packages]) + "\n" + assert gzips.data(users_file) == expected + + # Expect that userfile got created and contains the Username + # of the single user fixture. + users_file = archives.get("userfile") + assert gzips.data(users_file) == "test\n" + + # Expect that packagesmetafile got created and holds a valid + # JSON list with one entry per package fixture. + meta_file = archives.get("packagesmetafile") + data = json.loads(gzips.data(meta_file)) + assert len(data) == 5 + + # Expect that packagesmetaextfile got created and holds a valid + # JSON list with one entry per package fixture. + meta_file = archives.get("packagesmetaextfile") + data = json.loads(gzips.data(meta_file)) + assert len(data) == 5