change(mkpkglists): converted to use aurweb.db ORM

- Improved speed dramatically
- Removed mkpkglists sharness

Signed-off-by: Kevin Morris <kevr@0cost.org>
This commit is contained in:
Kevin Morris 2021-11-21 00:47:48 -08:00
parent c59acbf6d6
commit 29c2d0de6b
No known key found for this signature in database
GPG key ID: F7E46DED420788F3
4 changed files with 403 additions and 180 deletions

21
aurweb/benchmark.py Normal file
View file

@@ -0,0 +1,21 @@
from datetime import datetime
class Benchmark:
    """ A simple wall-clock timer for measuring elapsed seconds. """

    def __init__(self):
        # Begin timing immediately upon construction.
        self.start()

    def _timestamp(self) -> float:
        """ Generate a timestamp. """
        return float(datetime.utcnow().timestamp())

    def start(self) -> float:
        """ Start a benchmark and return the starting timestamp.

        :return: Current UTC timestamp in seconds (float)
        """
        self.current = self._timestamp()
        return self.current

    def end(self) -> float:
        """ Return the diff between now - start() and reset the timer.

        :return: Elapsed seconds since the last start() (float)
        """
        n = self._timestamp() - self.current
        # Reset so a stale start time can't be reused accidentally.
        self.current = float(0)
        return n

View file

@@ -23,23 +23,28 @@ import os
import sys
from collections import defaultdict
from decimal import Decimal
from typing import Any, Dict
import orjson
from sqlalchemy import literal, orm
import aurweb.config
import aurweb.db
from aurweb import db, logging, models, util
from aurweb.benchmark import Benchmark
from aurweb.models import Package, PackageBase, User
logger = logging.get_logger("aurweb.scripts.mkpkglists")
archivedir = aurweb.config.get("mkpkglists", "archivedir")
os.makedirs(archivedir, exist_ok=True)
packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
userfile = aurweb.config.get('mkpkglists', 'userfile')
PACKAGES = aurweb.config.get('mkpkglists', 'packagesfile')
META = aurweb.config.get('mkpkglists', 'packagesmetafile')
META_EXT = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
PKGBASE = aurweb.config.get('mkpkglists', 'pkgbasefile')
USERS = aurweb.config.get('mkpkglists', 'userfile')
TYPE_MAP = {
@@ -53,7 +58,7 @@ TYPE_MAP = {
}
def get_extended_dict(query: "orm.Query"):
    """ Produce extended package data in a single bulk query.

    Returns a mapping of the form:

        { <package_id>: { "Depends": [...], "License": [...],
                          "Keywords": [...], ... } }

    The caller can then use this data to decorate a base package
    dataset, e.g. `output[i].update(data.get(output[i]["ID"]))`.

    :param query: ORM query yielding (ID, Type, Name, Cond) rows
    :return: defaultdict keyed by package ID
    """
    data = defaultdict(lambda: defaultdict(list))

    for result in query:
        pkgid = result[0]
        # Map the raw type name (e.g. "depends") to its JSON key.
        key = TYPE_MAP.get(result[1], result[1])
        output = result[2]
        # Append the version condition (e.g. ">=1.0") when present.
        if result[3]:
            output += result[3]

        # In all cases, we have at least an empty License list.
        if "License" not in data[pkgid]:
            data[pkgid]["License"] = []

        # In all cases, we have at least an empty Keywords list.
        if "Keywords" not in data[pkgid]:
            data[pkgid]["Keywords"] = []

        data[pkgid][key].append(output)

    return data
def get_extended_fields():
    """ Query the extended (non-base) package fields via the ORM.

    Builds one UNION ALL of five subqueries — dependencies, relations,
    groups, licenses and keywords — each normalized to the shape
    (ID, Type, Name, Cond), and feeds it to get_extended_dict().

    :return: Mapping of package ID -> extended field lists
    """
    subqueries = [
        # PackageDependency
        db.query(
            models.PackageDependency
        ).join(models.DependencyType).with_entities(
            models.PackageDependency.PackageID.label("ID"),
            models.DependencyType.Name.label("Type"),
            models.PackageDependency.DepName.label("Name"),
            models.PackageDependency.DepCondition.label("Cond")
        ).distinct().order_by("Name"),

        # PackageRelation
        db.query(
            models.PackageRelation
        ).join(models.RelationType).with_entities(
            models.PackageRelation.PackageID.label("ID"),
            models.RelationType.Name.label("Type"),
            models.PackageRelation.RelName.label("Name"),
            models.PackageRelation.RelCondition.label("Cond")
        ).distinct().order_by("Name"),

        # Groups
        db.query(models.PackageGroup).join(
            models.Group,
            models.PackageGroup.GroupID == models.Group.ID
        ).with_entities(
            models.PackageGroup.PackageID.label("ID"),
            literal("Groups").label("Type"),
            models.Group.Name.label("Name"),
            literal(str()).label("Cond")
        ).distinct().order_by("Name"),

        # Licenses
        db.query(models.PackageLicense).join(
            models.License,
            models.PackageLicense.LicenseID == models.License.ID
        ).with_entities(
            models.PackageLicense.PackageID.label("ID"),
            literal("License").label("Type"),
            models.License.Name.label("Name"),
            literal(str()).label("Cond")
        ).distinct().order_by("Name"),

        # Keywords (keywords live on the package base; join through it).
        db.query(models.PackageKeyword).join(
            models.Package,
            models.Package.PackageBaseID == models.PackageKeyword.PackageBaseID
        ).with_entities(
            models.Package.ID.label("ID"),
            literal("Keywords").label("Type"),
            models.PackageKeyword.Keyword.label("Name"),
            literal(str()).label("Cond")
        ).distinct().order_by("Name")
    ]
    query = subqueries[0].union_all(*subqueries[1:])
    return get_extended_dict(query)
@@ -137,89 +156,122 @@ EXTENDED_FIELD_HANDLERS = {
}
def is_decimal(column):
    """ Convert `column` to a float if it is a decimal.Decimal;
    otherwise return it unchanged.

    JSON serializers cannot handle Decimal values directly, so decimal
    columns (e.g. Popularity) are coerced to float before serializing.

    :param column: Any column value
    :return: float(column) when column is a Decimal, else column as-is
    """
    if isinstance(column, Decimal):
        return float(column)
    return column
def as_dict(package: Package) -> Dict[str, Any]:
    """ Flatten a package result row into a JSON-serializable dict.

    Key order matters: it is preserved in the serialized JSON output.
    Popularity is coerced to float (it arrives as a Decimal).
    """
    record: Dict[str, Any] = {}
    for field in ("ID", "Name", "PackageBaseID", "PackageBase",
                  "Version", "Description", "NumVotes"):
        record[field] = getattr(package, field)
    record["Popularity"] = float(package.Popularity)
    for field in ("OutOfDate", "Maintainer",
                  "FirstSubmitted", "LastModified"):
        record[field] = getattr(package, field)
    return record
def write_archive(archive: str, output: list):
    """ Serialize `output` as a JSON list into a gzipped archive.

    Items are written one per line, comma-separated, wrapped in
    "[\n" ... "]" so the result is a valid JSON array.

    :param archive: Path of the gzip archive to write
    :param output: List of JSON-serializable records
    """
    last = len(output) - 1
    with gzip.open(archive, "wb") as f:
        f.write(b"[\n")
        for idx, record in enumerate(output):
            f.write(orjson.dumps(record))
            # Every line but the last gets a trailing comma.
            if idx != last:
                f.write(b",")
            f.write(b"\n")
        f.write(b"]")
def _main():
    """ Generate all mkpkglists archives in a single pass.

    Produces packages.gz and packages-meta-v1.json.gz while streaming
    over one bulk ORM query; with an extended-fields argv flag, also
    produces packages-meta-ext-v1.json.gz. Finally writes pkgbase.gz
    and users.gz from their own small queries.
    """
    bench = Benchmark()
    logger.info("Started re-creating archives, wait a while...")

    # Bulk package query; column set copied from the RPC interface.
    # Only packages whose base has a packager are published.
    query = db.query(Package).join(
        PackageBase,
        PackageBase.ID == Package.PackageBaseID
    ).join(
        User,
        PackageBase.MaintainerUID == User.ID,
        isouter=True
    ).filter(PackageBase.PackagerUID.isnot(None)).with_entities(
        Package.ID,
        Package.Name,
        PackageBase.ID.label("PackageBaseID"),
        PackageBase.Name.label("PackageBase"),
        Package.Version,
        Package.Description,
        PackageBase.NumVotes,
        PackageBase.Popularity,
        PackageBase.OutOfDateTS.label("OutOfDate"),
        User.Username.label("Maintainer"),
        PackageBase.SubmittedTS.label("FirstSubmitted"),
        PackageBase.ModifiedTS.label("LastModified")
    ).distinct().order_by("Name")

    # Produce packages.gz + packages-meta-v1.json.gz in one pass.
    snapshot_uri = aurweb.config.get("options", "snapshot_uri")
    gzips = {
        "packages": gzip.open(PACKAGES, "wt"),
        "meta": gzip.open(META, "wb"),
    }

    # Append list opening to the metafile.
    gzips["meta"].write(b"[\n")

    # Optionally produce packages-meta-ext-v1.json.gz as well.
    extended = False
    if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
        gzips["meta_ext"] = gzip.open(META_EXT, "wb")
        # Append list opening to the meta_ext file.
        gzips.get("meta_ext").write(b"[\n")
        f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
        data = f()
        extended = True

    results = query.all()
    n = len(results) - 1
    for i, result in enumerate(results):
        # Append to packages.gz.
        gzips.get("packages").write(f"{result.Name}\n")

        # Construct our result JSON dictionary.
        item = as_dict(result)
        item["URLPath"] = snapshot_uri % result.Name

        # We stream out package json objects line per line, so
        # we also need to include the ',' character at the end
        # of package lines (excluding the last package).
        suffix = b",\n" if i < n else b'\n'

        # Write out to packagesmetafile.
        gzips.get("meta").write(orjson.dumps(item) + suffix)

        if extended:
            # Write out to packagesmetaextfile, with the extended
            # fields merged in (packages without any get no extras).
            item.update(data.get(result.ID, {}))
            gzips.get("meta_ext").write(orjson.dumps(item) + suffix)

    # Append the list closing to meta/meta_ext.
    gzips.get("meta").write(b"]")
    if extended:
        gzips.get("meta_ext").write(b"]")

    # Close gzip files.
    util.apply_all(gzips.values(), lambda gz: gz.close())

    # Produce pkgbase.gz
    query = db.query(PackageBase.Name).filter(
        PackageBase.PackagerUID.isnot(None)).all()
    with gzip.open(PKGBASE, "wt") as f:
        f.writelines([f"{base.Name}\n" for base in query])

    # Produce users.gz
    query = db.query(User.Username).all()
    with gzip.open(USERS, "wt") as f:
        f.writelines([f"{user.Username}\n" for user in query])

    seconds = util.number_format(bench.end(), 4)
    logger.info(f"Completed in {seconds} seconds.")
def main():
    """ Script entrypoint: initialize the DB engine, then run the
    archive generation inside a db.begin() context. """
    db.get_engine()
    with db.begin():
        _main()
if __name__ == '__main__':

View file

@@ -1,65 +0,0 @@
#!/bin/sh
# Sharness test suite for the mkpkglists archive-generation script.
test_description='mkpkglists tests'

. "$(dirname "$0")/setup.sh"

# With all package records deleted, the generated archives must be empty.
test_expect_success 'Test package list generation with no packages.' '
echo "DELETE FROM Packages;" | sqlite3 aur.db &&
echo "DELETE FROM PackageBases;" | sqlite3 aur.db &&
cover "$MKPKGLISTS" &&
test $(zcat packages.gz | wc -l) -eq 0 &&
test $(zcat pkgbase.gz | wc -l) -eq 0
'

# Seed bases and packages; bases with a NULL PackagerUID (foobar3) and
# their packages (pkg5) must be excluded from the generated lists.
test_expect_success 'Test package list generation.' '
cat <<-EOD | sqlite3 aur.db &&
INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (1, "foobar", 1, 0, 0, "");
INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (2, "foobar2", 2, 0, 0, "");
INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (3, "foobar3", NULL, 0, 0, "");
INSERT INTO PackageBases (ID, Name, PackagerUID, SubmittedTS, ModifiedTS, FlaggerComment) VALUES (4, "foobar4", 1, 0, 0, "");
INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (1, 1, "pkg1");
INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (2, 1, "pkg2");
INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (3, 1, "pkg3");
INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (4, 2, "pkg4");
INSERT INTO Packages (ID, PackageBaseID, Name) VALUES (5, 3, "pkg5");
EOD
cover "$MKPKGLISTS" &&
cat <<-EOD >expected &&
foobar
foobar2
foobar4
EOD
gunzip pkgbase.gz &&
sed "/^#/d" pkgbase >actual &&
test_cmp actual expected &&
cat <<-EOD >expected &&
pkg1
pkg2
pkg3
pkg4
EOD
gunzip packages.gz &&
sed "/^#/d" packages >actual &&
test_cmp actual expected
'

# users.gz must list every user account created by setup.sh.
test_expect_success 'Test user list generation.' '
cover "$MKPKGLISTS" &&
cat <<-EOD >expected &&
dev
tu
tu2
tu3
tu4
user
user2
user3
user4
EOD
gunzip users.gz &&
sed "/^#/d" users >actual &&
test_cmp actual expected
'

test_done

215
test/test_mkpkglists.py Normal file
View file

@@ -0,0 +1,215 @@
import json
from typing import List, Union
from unittest import mock
import pytest
from aurweb import config, db, util
from aurweb.models import License, Package, PackageBase, PackageDependency, PackageLicense, User
from aurweb.models.account_type import USER_ID
from aurweb.models.dependency_type import DEPENDS_ID
from aurweb.testing import noop
class FakeFile:
    """ In-memory stand-in for a (gzip) file object; records
    everything written to it in `self.data` as a str. """

    # Accumulated, decoded data written to this file.
    data = str()

    # Context-manager exit is a no-op; nothing to release.
    __exit__ = noop

    def __init__(self, modes: str) -> None:
        self.modes = modes
        # Give each instance its own buffer rather than relying on
        # the class attribute fallback.
        self.data = str()

    def __enter__(self, *args, **kwargs) -> "FakeFile":
        return self

    def write(self, data: Union[str, bytes]) -> None:
        # Normalize bytes to str so `data` stays a single string.
        if isinstance(data, bytes):
            data = data.decode()
        self.data += data

    def writelines(self, dataset: List[Union[str, bytes]]) -> None:
        util.apply_all(dataset, self.write)

    def close(self) -> None:
        return
class MockGzipOpen:
    """ Mock replacement for gzip.open which hands out FakeFile
    objects and remembers them keyed by archive path. """

    def __init__(self):
        self.gzips = dict()

    def open(self, archive: str, modes: str):
        # Create, remember and return a fresh fake file handle.
        fake = FakeFile(modes)
        self.gzips[archive] = fake
        return fake

    def get(self, key: str) -> FakeFile:
        return self.gzips.get(key)

    def __getitem__(self, key: str) -> FakeFile:
        return self.get(key)

    def __contains__(self, key: str) -> bool:
        return key in self.gzips

    def data(self, archive: str):
        # Convenience accessor for the recorded contents of `archive`.
        return self.get(archive).data
@pytest.fixture(autouse=True)
def setup(db_test):
    # Re-read configuration before each test; db_test prepares the
    # test database (presumably per-test — behavior defined elsewhere).
    config.rehash()
@pytest.fixture
def user() -> User:
    """ Yield a single plain test user account. """
    account_info = dict(
        Username="test",
        Email="test@example.org",
        Passwd="testPassword",
        AccountTypeID=USER_ID,
    )
    with db.begin():
        test_user = db.create(User, **account_info)
    yield test_user
@pytest.fixture
def packages(user: User) -> List[Package]:
    """ Yield five packages (sorted by name), each carrying a shared
    license and one versioned dependency. """
    created = []
    with db.begin():
        gpl = db.create(License, Name="GPL")

        for idx in range(5):
            # Create the package.
            base = db.create(PackageBase, Name=f"pkgbase_{idx}",
                             Packager=user)
            package = db.create(Package, PackageBase=base,
                                Name=f"pkg_{idx}")

            # Create some related records.
            db.create(PackageLicense, Package=package, License=gpl)
            db.create(PackageDependency, DepTypeID=DEPENDS_ID,
                      Package=package, DepName=f"dep_{idx}",
                      DepCondition=">=1.0")

            # Add the package to our output list.
            created.append(package)

    # Sort output by the package name and return it.
    yield sorted(created, key=lambda pkg: pkg.Name)
@mock.patch("os.makedirs", side_effect=noop)
def test_mkpkglists_empty(makedirs: mock.MagicMock):
    """ With no DB records, every archive is created but empty. """
    gzips = MockGzipOpen()
    with mock.patch("gzip.open", side_effect=gzips.open):
        from aurweb.scripts import mkpkglists
        mkpkglists.main()

    archives = config.get_section("mkpkglists")
    archives.pop("archivedir")
    # Not running in extended mode, so no meta-ext archive is produced.
    archives.pop("packagesmetaextfile")

    # Every remaining configured archive should have been opened.
    for archive in archives.values():
        assert archive in gzips

    # Expect that packagesfile got created, but is empty because
    # we have no DB records.
    packages_file = archives.get("packagesfile")
    assert gzips.data(packages_file) == str()

    # Expect that pkgbasefile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("pkgbasefile")
    assert gzips.data(users_file) == str()

    # Expect that userfile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("userfile")
    assert gzips.data(users_file) == str()

    # Expect that packagesmetafile got created, but is empty because
    # we have no DB records; it's still a valid empty JSON list.
    meta_file = archives.get("packagesmetafile")
    assert gzips.data(meta_file) == "[\n]"
@mock.patch("sys.argv", ["mkpkglists", "--extended"])
@mock.patch("os.makedirs", side_effect=noop)
def test_mkpkglists_extended_empty(makedirs: mock.MagicMock):
    """ --extended with no DB records: all archives created, all empty. """
    gzips = MockGzipOpen()
    with mock.patch("gzip.open", side_effect=gzips.open):
        from aurweb.scripts import mkpkglists
        mkpkglists.main()

    archives = config.get_section("mkpkglists")
    archives.pop("archivedir")

    # Every configured archive (including meta-ext) should be opened.
    for archive in archives.values():
        assert archive in gzips

    # Expect that packagesfile got created, but is empty because
    # we have no DB records.
    packages_file = archives.get("packagesfile")
    assert gzips.data(packages_file) == str()

    # Expect that pkgbasefile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("pkgbasefile")
    assert gzips.data(users_file) == str()

    # Expect that userfile got created, but is empty because
    # we have no DB records.
    users_file = archives.get("userfile")
    assert gzips.data(users_file) == str()

    # Expect that packagesmetafile got created, but is empty because
    # we have no DB records; it's still a valid empty JSON list.
    meta_file = archives.get("packagesmetafile")
    assert gzips.data(meta_file) == "[\n]"

    # Expect that packagesmetaextfile got created, but is empty because
    # we have no DB records; it's still a valid empty JSON list.
    meta_file = archives.get("packagesmetaextfile")
    assert gzips.data(meta_file) == "[\n]"
@mock.patch("sys.argv", ["mkpkglists", "--extended"])
@mock.patch("os.makedirs", side_effect=noop)
def test_mkpkglists_extended(makedirs: mock.MagicMock, user: User,
                             packages: List[Package]):
    """ --extended with five package fixtures: every archive is
    populated accordingly. """
    gzips = MockGzipOpen()
    with mock.patch("gzip.open", side_effect=gzips.open):
        from aurweb.scripts import mkpkglists
        mkpkglists.main()

    archives = config.get_section("mkpkglists")
    archives.pop("archivedir")

    # Every configured archive (including meta-ext) should be opened.
    for archive in archives.values():
        assert archive in gzips

    # Expect packagesfile to contain each fixture package's name,
    # one per line.
    packages_file = archives.get("packagesfile")
    expected = "\n".join([p.Name for p in packages]) + "\n"
    assert gzips.data(packages_file) == expected

    # Expect pkgbasefile to contain each fixture's package base name.
    users_file = archives.get("pkgbasefile")
    expected = "\n".join([p.PackageBase.Name for p in packages]) + "\n"
    assert gzips.data(users_file) == expected

    # Expect userfile to contain the single test user.
    users_file = archives.get("userfile")
    assert gzips.data(users_file) == "test\n"

    # Expect packagesmetafile to hold valid JSON with one record
    # per fixture package.
    meta_file = archives.get("packagesmetafile")
    data = json.loads(gzips.data(meta_file))
    assert len(data) == 5

    # Expect packagesmetaextfile to hold valid JSON with one record
    # per fixture package as well.
    meta_file = archives.get("packagesmetaextfile")
    data = json.loads(gzips.data(meta_file))
    assert len(data) == 5