change(mkpkglists): converted to use aurweb.db ORM

- Improved speed dramatically
- Removed mkpkglists sharness

Signed-off-by: Kevin Morris <kevr@0cost.org>
This commit is contained in:
Kevin Morris 2021-11-21 00:47:48 -08:00
parent c59acbf6d6
commit 29c2d0de6b
No known key found for this signature in database
GPG key ID: F7E46DED420788F3
4 changed files with 403 additions and 180 deletions

21
aurweb/benchmark.py Normal file
View file

@ -0,0 +1,21 @@
from datetime import datetime, timezone
class Benchmark:
    """Wall-clock benchmark timer.

    The timer starts on construction; call end() for the elapsed
    seconds since the most recent start().
    """

    def __init__(self):
        self.start()

    def _timestamp(self) -> float:
        """Return the current POSIX timestamp (seconds since the epoch).

        Uses a timezone-aware datetime: ``datetime.utcnow().timestamp()``
        would misinterpret the naive UTC value as local time (and utcnow
        is deprecated in recent Python versions).
        """
        return datetime.now(timezone.utc).timestamp()

    def start(self) -> float:
        """(Re)start the benchmark; return the start timestamp."""
        self.current = self._timestamp()
        return self.current

    def end(self) -> float:
        """Return the seconds elapsed since start() and reset the timer."""
        elapsed = self._timestamp() - self.current
        # Reset so a stale start time cannot be reused accidentally.
        self.current = float(0)
        return elapsed

View file

@ -23,23 +23,28 @@ import os
import sys import sys
from collections import defaultdict from collections import defaultdict
from decimal import Decimal from typing import Any, Dict
import orjson import orjson
from sqlalchemy import literal, orm
import aurweb.config import aurweb.config
import aurweb.db
from aurweb import db, logging, models, util
from aurweb.benchmark import Benchmark
from aurweb.models import Package, PackageBase, User
logger = logging.get_logger("aurweb.scripts.mkpkglists")
archivedir = aurweb.config.get("mkpkglists", "archivedir") archivedir = aurweb.config.get("mkpkglists", "archivedir")
os.makedirs(archivedir, exist_ok=True) os.makedirs(archivedir, exist_ok=True)
packagesfile = aurweb.config.get('mkpkglists', 'packagesfile') PACKAGES = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile') META = aurweb.config.get('mkpkglists', 'packagesmetafile')
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile') META_EXT = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
PKGBASE = aurweb.config.get('mkpkglists', 'pkgbasefile')
pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile') USERS = aurweb.config.get('mkpkglists', 'userfile')
userfile = aurweb.config.get('mkpkglists', 'userfile')
TYPE_MAP = { TYPE_MAP = {
@ -53,7 +58,7 @@ TYPE_MAP = {
} }
def get_extended_dict(query: str): def get_extended_dict(query: orm.Query):
""" """
Produce data in the form in a single bulk SQL query: Produce data in the form in a single bulk SQL query:
@ -74,61 +79,75 @@ def get_extended_dict(query: str):
output[i].update(data.get(package_id)) output[i].update(data.get(package_id))
""" """
conn = aurweb.db.Connection()
cursor = conn.execute(query)
data = defaultdict(lambda: defaultdict(list)) data = defaultdict(lambda: defaultdict(list))
for result in cursor.fetchall(): for result in query:
pkgid = result[0] pkgid = result[0]
key = TYPE_MAP.get(result[1], result[1]) key = TYPE_MAP.get(result[1], result[1])
output = result[2] output = result[2]
if result[3]: if result[3]:
output += result[3] output += result[3]
# In all cases, we have at least an empty License list.
if "License" not in data[pkgid]:
data[pkgid]["License"] = []
# In all cases, we have at least an empty Keywords list.
if "Keywords" not in data[pkgid]:
data[pkgid]["Keywords"] = []
data[pkgid][key].append(output) data[pkgid][key].append(output)
conn.close()
return data return data
def get_extended_fields(): def get_extended_fields():
# Returns: [ID, Type, Name, Cond] subqueries = [
query = """ # PackageDependency
SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type, db.query(
PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond models.PackageDependency
FROM PackageDepends ).join(models.DependencyType).with_entities(
LEFT JOIN DependencyTypes models.PackageDependency.PackageID.label("ID"),
ON DependencyTypes.ID = PackageDepends.DepTypeID models.DependencyType.Name.label("Type"),
UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type, models.PackageDependency.DepName.label("Name"),
PackageRelations.RelName AS Name, models.PackageDependency.DepCondition.label("Cond")
PackageRelations.RelCondition AS Cond ).distinct().order_by("Name"),
FROM PackageRelations
LEFT JOIN RelationTypes # PackageRelation
ON RelationTypes.ID = PackageRelations.RelTypeID db.query(
UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type, models.PackageRelation
Groups.Name, '' AS Cond ).join(models.RelationType).with_entities(
FROM Groups models.PackageRelation.PackageID.label("ID"),
INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID models.RelationType.Name.label("Type"),
UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type, models.PackageRelation.RelName.label("Name"),
Licenses.Name, '' as Cond models.PackageRelation.RelCondition.label("Cond")
FROM Licenses ).distinct().order_by("Name"),
INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
UNION SELECT Packages.ID AS ID, 'Keywords' AS Type, # Groups
PackageKeywords.Keyword AS Name, '' as Cond db.query(models.PackageGroup).join(
FROM PackageKeywords models.Group,
INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID models.PackageGroup.GroupID == models.Group.ID
""" ).with_entities(
models.PackageGroup.PackageID.label("ID"),
literal("Groups").label("Type"),
models.Group.Name.label("Name"),
literal(str()).label("Cond")
).distinct().order_by("Name"),
# Licenses
db.query(models.PackageLicense).join(
models.License,
models.PackageLicense.LicenseID == models.License.ID
).with_entities(
models.PackageLicense.PackageID.label("ID"),
literal("License").label("Type"),
models.License.Name.label("Name"),
literal(str()).label("Cond")
).distinct().order_by("Name"),
# Keywords
db.query(models.PackageKeyword).join(
models.Package,
Package.PackageBaseID == models.PackageKeyword.PackageBaseID
).with_entities(
models.Package.ID.label("ID"),
literal("Keywords").label("Type"),
models.PackageKeyword.Keyword.label("Name"),
literal(str()).label("Cond")
).distinct().order_by("Name")
]
query = subqueries[0].union_all(*subqueries[1:])
return get_extended_dict(query) return get_extended_dict(query)
@ -137,89 +156,122 @@ EXTENDED_FIELD_HANDLERS = {
} }
def is_decimal(column): def as_dict(package: Package) -> Dict[str, Any]:
""" Check if an SQL column is of decimal.Decimal type. """ return {
if isinstance(column, Decimal): "ID": package.ID,
return float(column) "Name": package.Name,
return column "PackageBaseID": package.PackageBaseID,
"PackageBase": package.PackageBase,
"Version": package.Version,
"Description": package.Description,
"NumVotes": package.NumVotes,
"Popularity": float(package.Popularity),
"OutOfDate": package.OutOfDate,
"Maintainer": package.Maintainer,
"FirstSubmitted": package.FirstSubmitted,
"LastModified": package.LastModified,
}
def write_archive(archive: str, output: list): def _main():
with gzip.open(archive, "wb") as f: bench = Benchmark()
f.write(b"[\n") logger.info("Started re-creating archives, wait a while...")
for i, item in enumerate(output):
f.write(orjson.dumps(item))
if i < len(output) - 1:
f.write(b",")
f.write(b"\n")
f.write(b"]")
query = db.query(Package).join(
def main(): PackageBase,
conn = aurweb.db.Connection() PackageBase.ID == Package.PackageBaseID
).join(
# Query columns; copied from RPC. User,
columns = ("Packages.ID, Packages.Name, " PackageBase.MaintainerUID == User.ID,
"PackageBases.ID AS PackageBaseID, " isouter=True
"PackageBases.Name AS PackageBase, " ).filter(PackageBase.PackagerUID.isnot(None)).with_entities(
"Version, Description, URL, NumVotes, " Package.ID,
"Popularity, OutOfDateTS AS OutOfDate, " Package.Name,
"Users.UserName AS Maintainer, " PackageBase.ID.label("PackageBaseID"),
"SubmittedTS AS FirstSubmitted, " PackageBase.Name.label("PackageBase"),
"ModifiedTS AS LastModified") Package.Version,
Package.Description,
# Perform query. PackageBase.NumVotes,
cur = conn.execute(f"SELECT {columns} FROM Packages " PackageBase.Popularity,
"LEFT JOIN PackageBases " PackageBase.OutOfDateTS.label("OutOfDate"),
"ON PackageBases.ID = Packages.PackageBaseID " User.Username.label("Maintainer"),
"LEFT JOIN Users " PackageBase.SubmittedTS.label("FirstSubmitted"),
"ON PackageBases.MaintainerUID = Users.ID " PackageBase.ModifiedTS.label("LastModified")
"WHERE PackageBases.PackagerUID IS NOT NULL") ).distinct().order_by("Name")
# Produce packages-meta-v1.json.gz # Produce packages-meta-v1.json.gz
output = list() output = list()
snapshot_uri = aurweb.config.get("options", "snapshot_uri") snapshot_uri = aurweb.config.get("options", "snapshot_uri")
for result in cur.fetchall(): gzips = {
item = { "packages": gzip.open(PACKAGES, "wt"),
column[0]: is_decimal(result[i]) "meta": gzip.open(META, "wb"),
for i, column in enumerate(cur.description)
} }
item["URLPath"] = snapshot_uri % item.get("Name")
output.append(item)
write_archive(packagesmetafile, output) # Append list opening to the metafile.
gzips["meta"].write(b"[\n")
# Produce packages-meta-ext-v1.json.gz # Produce packages.gz + packages-meta-ext-v1.json.gz
extended = False
if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS: if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
gzips["meta_ext"] = gzip.open(META_EXT, "wb")
# Append list opening to the meta_ext file.
gzips.get("meta_ext").write(b"[\n")
f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1]) f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
data = f() data = f()
extended = True
default_ = {"Groups": [], "License": [], "Keywords": []} results = query.all()
for i in range(len(output)): n = len(results) - 1
data_ = data.get(output[i].get("ID"), default_) for i, result in enumerate(results):
output[i].update(data_) # Append to packages.gz.
gzips.get("packages").write(f"{result.Name}\n")
write_archive(packagesmetaextfile, output) # Construct our result JSON dictionary.
item = as_dict(result)
item["URLPath"] = snapshot_uri % result.Name
# Produce packages.gz # We stream out package json objects line per line, so
with gzip.open(packagesfile, "wb") as f: # we also need to include the ',' character at the end
f.writelines([ # of package lines (excluding the last package).
bytes(x.get("Name") + "\n", "UTF-8") suffix = b",\n" if i < n else b'\n'
for x in output
]) # Write out to packagesmetafile
output.append(item)
gzips.get("meta").write(orjson.dumps(output[-1]) + suffix)
if extended:
# Write out to packagesmetaextfile.
data_ = data.get(result.ID, {})
output[-1].update(data_)
gzips.get("meta_ext").write(orjson.dumps(output[-1]) + suffix)
# Append the list closing to meta/meta_ext.
gzips.get("meta").write(b"]")
if extended:
gzips.get("meta_ext").write(b"]")
# Close gzip files.
util.apply_all(gzips.values(), lambda gz: gz.close())
# Produce pkgbase.gz # Produce pkgbase.gz
with gzip.open(pkgbasefile, "w") as f: query = db.query(PackageBase.Name).filter(
cur = conn.execute("SELECT Name FROM PackageBases " + PackageBase.PackagerUID.isnot(None)).all()
"WHERE PackagerUID IS NOT NULL") with gzip.open(PKGBASE, "wt") as f:
f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) f.writelines([f"{base.Name}\n" for i, base in enumerate(query)])
# Produce users.gz # Produce users.gz
with gzip.open(userfile, "w") as f: query = db.query(User.Username).all()
cur = conn.execute("SELECT UserName FROM Users") with gzip.open(USERS, "wt") as f:
f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) f.writelines([f"{user.Username}\n" for i, user in enumerate(query)])
conn.close() seconds = util.number_format(bench.end(), 4)
logger.info(f"Completed in {seconds} seconds.")
def main():
db.get_engine()
with db.begin():
_main()
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -1,65 +0,0 @@
#!/bin/sh
# Sharness test suite for the mkpkglists script: verifies generation of
# the packages.gz, pkgbase.gz and users.gz archive files.

test_description='mkpkglists tests'

. "$(dirname "$0")/setup.sh"

# With every package row removed, both list archives must still be
# produced, but contain zero lines.
test_expect_success 'Test package list generation with no packages.' '
	echo "DELETE FROM Packages;" | sqlite3 aur.db &&
	echo "DELETE FROM PackageBases;" | sqlite3 aur.db &&
	cover "$MKPKGLISTS" &&
	test $(zcat packages.gz | wc -l) -eq 0 &&
	test $(zcat pkgbase.gz | wc -l) -eq 0
'

# Seed package bases and packages. Base 3 has a NULL PackagerUID
# (orphan), so it must be excluded from pkgbase.gz and its package
# (pkg5) from packages.gz.
test_expect_success 'Test package list generation.' '
	cat <<-EOD | sqlite3 aur.db &&
	INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (1, "foobar", 1, 0, 0, "");
	INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (2, "foobar2", 2, 0, 0, "");
	INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (3, "foobar3", NULL, 0, 0, "");
	INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (4, "foobar4", 1, 0, 0, "");
	INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (1, 1, "pkg1");
	INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (2, 1, "pkg2");
	INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (3, 1, "pkg3");
	INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (4, 2, "pkg4");
	INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (5, 3, "pkg5");
	EOD
	cover "$MKPKGLISTS" &&
	cat <<-EOD >expected &&
	foobar
	foobar2
	foobar4
	EOD
	gunzip pkgbase.gz &&
	sed "/^#/d" pkgbase >actual &&
	test_cmp actual expected &&
	cat <<-EOD >expected &&
	pkg1
	pkg2
	pkg3
	pkg4
	EOD
	gunzip packages.gz &&
	sed "/^#/d" packages >actual &&
	test_cmp actual expected
'

# users.gz must list every user name from the fixture DB, one per line.
test_expect_success 'Test user list generation.' '
	cover "$MKPKGLISTS" &&
	cat <<-EOD >expected &&
	dev
	tu
	tu2
	tu3
	tu4
	user
	user2
	user3
	user4
	EOD
	gunzip users.gz &&
	sed "/^#/d" users >actual &&
	test_cmp actual expected
'

test_done

215
test/test_mkpkglists.py Normal file
View file

@ -0,0 +1,215 @@
import json
from typing import List, Union
from unittest import mock
import pytest
from aurweb import config, db, util
from aurweb.models import License, Package, PackageBase, PackageDependency, PackageLicense, User
from aurweb.models.account_type import USER_ID
from aurweb.models.dependency_type import DEPENDS_ID
from aurweb.testing import noop
class FakeFile:
    """In-memory stand-in for a (gzip) file handle.

    Captures everything written through it in ``self.data`` so tests can
    inspect archive contents without touching the filesystem. Usable as a
    context manager like a real file object.
    """

    # Class-level default; ``write`` rebinds an instance attribute on
    # first use, so instances do not share accumulated data.
    data = str()

    # Context-manager exit is a no-op; nothing to flush or close.
    __exit__ = noop

    def __init__(self, modes: str) -> None:
        # Was annotated ``-> "FakeFile"``; __init__ always returns None.
        self.modes = modes

    def __enter__(self, *args, **kwargs) -> "FakeFile":
        return self

    def write(self, data: Union[str, bytes]) -> None:
        """Append str data (bytes are decoded first) to self.data."""
        if isinstance(data, bytes):
            data = data.decode()
        self.data += data

    def writelines(self, dataset: List[Union[str, bytes]]) -> None:
        """Write every item of dataset through self.write."""
        util.apply_all(dataset, self.write)

    def close(self) -> None:
        """No-op; present for file API compatibility."""
        return
class MockGzipOpen:
    """Registry of FakeFile handles keyed by archive path.

    An instance's ``open`` method is patched in place of ``gzip.open``
    so tests can later inspect what each archive would contain.
    """

    def __init__(self):
        self.gzips = dict()

    def open(self, archive: str, modes: str):
        """Create, register and return a FakeFile for `archive`."""
        handle = FakeFile(modes)
        self.gzips[archive] = handle
        return handle

    def get(self, key: str) -> FakeFile:
        return self.gzips.get(key)

    def __getitem__(self, key: str) -> FakeFile:
        return self.get(key)

    def __contains__(self, key: str) -> bool:
        return key in self.gzips

    def data(self, archive: str):
        """Return the data written to the handle registered at `archive`."""
        return self.get(archive).data
@pytest.fixture(autouse=True)
def setup(db_test):
    # Runs for every test: db_test provides a clean test database and
    # rehash() re-reads the configuration.
    config.rehash()
@pytest.fixture
def user() -> User:
    """Create and yield a single normal-account test User record."""
    with db.begin():
        user = db.create(User, Username="test",
                         Email="test@example.org",
                         Passwd="testPassword",
                         AccountTypeID=USER_ID)
    yield user
@pytest.fixture
def packages(user: User) -> List[Package]:
    """Create five packages, each with a license and a dependency,
    and yield them sorted by package name."""
    created = []
    with db.begin():
        gpl = db.create(License, Name="GPL")
        for n in range(5):
            # Package base + package owned by the test user.
            base = db.create(PackageBase, Name=f"pkgbase_{n}",
                             Packager=user)
            package = db.create(Package, PackageBase=base,
                                Name=f"pkg_{n}")

            # Related license and dependency records.
            db.create(PackageLicense, Package=package, License=gpl)
            db.create(PackageDependency, DepTypeID=DEPENDS_ID,
                      Package=package, DepName=f"dep_{n}",
                      DepCondition=">=1.0")

            created.append(package)

    yield sorted(created, key=lambda pkg: pkg.Name)
@mock.patch("os.makedirs", side_effect=noop)
def test_mkpkglists_empty(makedirs: mock.MagicMock):
    """Without --extended, all non-extended archives are written and,
    with an empty DB, each one is empty."""
    gzips = MockGzipOpen()
    with mock.patch("gzip.open", side_effect=gzips.open):
        from aurweb.scripts import mkpkglists
        mkpkglists.main()

    archives = config.get_section("mkpkglists")
    archives.pop("archivedir")
    # packagesmetaextfile is only produced when --extended is given.
    archives.pop("packagesmetaextfile")
    for archive in archives.values():
        assert archive in gzips

    # Expect that packagesfile got created, but is empty because
    # we have no DB records.
    packages_file = archives.get("packagesfile")
    assert gzips.data(packages_file) == str()

    # Expect that pkgbasefile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("pkgbasefile")
    assert gzips.data(users_file) == str()

    # Expect that userfile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("userfile")
    assert gzips.data(users_file) == str()

    # Expect that packagesmetafile got created, but is empty because
    # we have no DB records; it's still a valid empty JSON list.
    meta_file = archives.get("packagesmetafile")
    assert gzips.data(meta_file) == "[\n]"
@mock.patch("sys.argv", ["mkpkglists", "--extended"])
@mock.patch("os.makedirs", side_effect=noop)
def test_mkpkglists_extended_empty(makedirs: mock.MagicMock):
    """With --extended and an empty DB, every archive (including the
    extended metafile) is written and empty."""
    gzips = MockGzipOpen()
    with mock.patch("gzip.open", side_effect=gzips.open):
        from aurweb.scripts import mkpkglists
        mkpkglists.main()

    archives = config.get_section("mkpkglists")
    archives.pop("archivedir")
    for archive in archives.values():
        assert archive in gzips

    # Expect that packagesfile got created, but is empty because
    # we have no DB records.
    packages_file = archives.get("packagesfile")
    assert gzips.data(packages_file) == str()

    # Expect that pkgbasefile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("pkgbasefile")
    assert gzips.data(users_file) == str()

    # Expect that userfile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("userfile")
    assert gzips.data(users_file) == str()

    # Expect that packagesmetafile got created, but is empty because
    # we have no DB records; it's still a valid empty JSON list.
    meta_file = archives.get("packagesmetafile")
    assert gzips.data(meta_file) == "[\n]"

    # Same expectation for packagesmetaextfile: created, and a valid
    # empty JSON list since there are no DB records.
    meta_file = archives.get("packagesmetaextfile")
    assert gzips.data(meta_file) == "[\n]"
@mock.patch("sys.argv", ["mkpkglists", "--extended"])
@mock.patch("os.makedirs", side_effect=noop)
def test_mkpkglists_extended(makedirs: mock.MagicMock, user: User,
                             packages: List[Package]):
    """With --extended and five seeded packages, every archive reflects
    the fixture data."""
    gzips = MockGzipOpen()
    with mock.patch("gzip.open", side_effect=gzips.open):
        from aurweb.scripts import mkpkglists
        mkpkglists.main()

    archives = config.get_section("mkpkglists")
    archives.pop("archivedir")
    for archive in archives.values():
        assert archive in gzips

    # packagesfile lists each fixture package name, one per line.
    packages_file = archives.get("packagesfile")
    expected = "\n".join([p.Name for p in packages]) + "\n"
    assert gzips.data(packages_file) == expected

    # pkgbasefile lists each fixture package base name, one per line.
    users_file = archives.get("pkgbasefile")
    expected = "\n".join([p.PackageBase.Name for p in packages]) + "\n"
    assert gzips.data(users_file) == expected

    # userfile lists the single fixture user.
    users_file = archives.get("userfile")
    assert gzips.data(users_file) == "test\n"

    # packagesmetafile holds a JSON list with one entry per package.
    meta_file = archives.get("packagesmetafile")
    data = json.loads(gzips.data(meta_file))
    assert len(data) == 5

    # packagesmetaextfile likewise holds one (extended) entry per package.
    meta_file = archives.get("packagesmetaextfile")
    data = json.loads(gzips.data(meta_file))
    assert len(data) == 5