fix(mkpkglists): improve package meta archive

The SQL logic in this file for package metadata now exactly
reflects RPC's search logic, without searching for specific
packages.

Two command line arguments are available:

    --extended | Include License, Keywords, Groups, relations
                 and dependencies.

When --extended is passed, the script will create a
packages-meta-ext-v1.json.gz, configured via packagesmetaextfile.

Archive JSON is in the following format: line-separated package objects
enclosed in a list:

    [
    {...},
    {...},
    {...}
    ]

Signed-off-by: Kevin Morris <kevr@0cost.org>
This commit is contained in:
Kevin Morris 2021-10-31 16:52:30 -07:00
parent f606140050
commit f3f662c696
No known key found for this signature in database
GPG key ID: F7E46DED420788F3
4 changed files with 255 additions and 24 deletions

View file

@ -1,16 +1,192 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""
Produces package, package base and user archives for the AUR
database.
Archives:
packages.gz | A line-separated list of package names
packages-meta-v1.json | A type=search RPC-formatted JSON dataset
packages-meta-ext-v1.json | An --extended archive
pkgbase.gz | A line-separated list of package base names
users.gz | A line-separated list of user names
This script takes an optional argument: --extended. Based
on the following, right-hand side fields are added to each item.
--extended | License, Keywords, Groups, relations and dependencies
"""
import datetime import datetime
import gzip import gzip
import json import os
import sys
from collections import defaultdict
from decimal import Decimal
from typing import Tuple
import orjson
import aurweb.config import aurweb.config
import aurweb.db import aurweb.db
def state_path(archive: str) -> str:
# A hard-coded /tmp state directory.
# TODO: Use Redis cache to store this state after we merge
# FastAPI into master and removed PHP from the tree.
return os.path.join("/tmp", os.path.basename(archive) + ".state")
packagesfile = aurweb.config.get('mkpkglists', 'packagesfile') packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile') packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
packages_state = state_path(packagesfile)
pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile') pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
pkgbases_state = state_path(pkgbasefile)
userfile = aurweb.config.get('mkpkglists', 'userfile') userfile = aurweb.config.get('mkpkglists', 'userfile')
users_state = state_path(userfile)
def should_update(state: str, tablename: str) -> Tuple[bool, int]:
if aurweb.config.get("database", "backend") != "mysql":
return (False, 0)
db_name = aurweb.config.get("database", "name")
conn = aurweb.db.Connection()
cur = conn.execute("SELECT auto_increment FROM information_schema.tables "
"WHERE table_schema = ? AND table_name = ?",
(db_name, tablename,))
update_time = cur.fetchone()[0]
saved_update_time = 0
if os.path.exists(state):
with open(state) as f:
saved_update_time = int(f.read().strip())
return (saved_update_time == update_time, update_time)
def update_state(state: str, update_time: int) -> None:
with open(state, "w") as f:
f.write(str(update_time))
TYPE_MAP = {
"depends": "Depends",
"makedepends": "MakeDepends",
"checkdepends": "CheckDepends",
"optdepends": "OptDepends",
"conflicts": "Conflicts",
"provides": "Provides",
"replaces": "Replaces",
}
def get_extended_dict(query: str):
"""
Produce data in the form in a single bulk SQL query:
{
<integer_package_id>: {
"Depends": [...],
"Conflicts": [...],
"License": [...]
}
}
The caller can then use this data to populate a dataset of packages.
output = produce_base_output_data()
data = get_extended_dict(query)
for i in range(len(output)):
package_id = output[i].get("ID")
output[i].update(data.get(package_id))
"""
conn = aurweb.db.Connection()
cursor = conn.execute(query)
data = defaultdict(lambda: defaultdict(list))
for result in cursor.fetchall():
pkgid = result[0]
key = TYPE_MAP.get(result[1])
output = result[2]
if result[3]:
output += result[3]
# In all cases, we have at least an empty License list.
if "License" not in data[pkgid]:
data[pkgid]["License"] = []
# In all cases, we have at least an empty Keywords list.
if "Keywords" not in data[pkgid]:
data[pkgid]["Keywords"] = []
data[pkgid][key].append(output)
conn.close()
return data
def get_extended_fields():
# Returns: [ID, Type, Name, Cond]
query = """
SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
FROM PackageDepends
LEFT JOIN DependencyTypes
ON DependencyTypes.ID = PackageDepends.DepTypeID
UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
PackageRelations.RelName AS Name,
PackageRelations.RelCondition AS Cond
FROM PackageRelations
LEFT JOIN RelationTypes
ON RelationTypes.ID = PackageRelations.RelTypeID
UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
Groups.Name, '' AS Cond
FROM Groups
INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
Licenses.Name, '' as Cond
FROM Licenses
INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
PackageKeywords.Keyword AS Name, '' as Cond
FROM PackageKeywords
INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
"""
return get_extended_dict(query)
EXTENDED_FIELD_HANDLERS = {
"--extended": get_extended_fields
}
def is_decimal(column):
""" Check if an SQL column is of decimal.Decimal type. """
if isinstance(column, Decimal):
return float(column)
return column
def write_archive(archive: str, output: list):
with gzip.open(archive, "wb") as f:
f.write(b"[\n")
for i, item in enumerate(output):
f.write(orjson.dumps(item))
if i < len(output) - 1:
f.write(b",")
f.write(b"\n")
f.write(b"]")
def main(): def main():
@ -21,32 +197,83 @@ def main():
pkgbaselist_header = "# AUR package base list, generated on " + datestr pkgbaselist_header = "# AUR package base list, generated on " + datestr
userlist_header = "# AUR user name list, generated on " + datestr userlist_header = "# AUR user name list, generated on " + datestr
with gzip.open(packagesfile, "w") as f: updated, update_time = should_update(packages_state, "Packages")
f.write(bytes(pkglist_header + "\n", "UTF-8")) if not updated:
cur = conn.execute("SELECT Packages.Name FROM Packages " + print("Updating Packages...")
"INNER JOIN PackageBases " +
"ON PackageBases.ID = Packages.PackageBaseID " + # Query columns; copied from RPC.
columns = ("Packages.ID, Packages.Name, "
"PackageBases.ID AS PackageBaseID, "
"PackageBases.Name AS PackageBase, "
"Version, Description, URL, NumVotes, "
"Popularity, OutOfDateTS AS OutOfDate, "
"Users.UserName AS Maintainer, "
"SubmittedTS AS FirstSubmitted, "
"ModifiedTS AS LastModified")
# Perform query.
cur = conn.execute(f"SELECT {columns} FROM Packages "
"LEFT JOIN PackageBases "
"ON PackageBases.ID = Packages.PackageBaseID "
"LEFT JOIN Users "
"ON PackageBases.MaintainerUID = Users.ID "
"WHERE PackageBases.PackagerUID IS NOT NULL") "WHERE PackageBases.PackagerUID IS NOT NULL")
f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
with gzip.open(packagesmetafile, "wt") as f: # Produce packages-meta-v1.json.gz
cur = conn.execute("SELECT * FROM Packages") output = list()
json.dump({ snapshot_uri = aurweb.config.get("options", "snapshot_uri")
"warning": "This is a experimental! It can be removed or modified without warning!", for result in cur.fetchall():
"columns": [d[0] for d in cur.description], item = {
"data": cur.fetchall() column[0]: is_decimal(result[i])
}, f) for i, column in enumerate(cur.description)
}
item["URLPath"] = snapshot_uri % item.get("Name")
output.append(item)
write_archive(packagesmetafile, output)
# Produce packages-meta-ext-v1.json.gz
if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
data = f()
default_ = {"Groups": [], "License": [], "Keywords": []}
for i in range(len(output)):
data_ = data.get(output[i].get("ID"), default_)
output[i].update(data_)
write_archive(packagesmetaextfile, output)
# Produce packages.gz
with gzip.open(packagesfile, "wb") as f:
f.write(bytes(pkglist_header + "\n", "UTF-8"))
f.writelines([
bytes(x.get("Name") + "\n", "UTF-8")
for x in output
])
update_state(packages_state, update_time)
updated, update_time = should_update(pkgbases_state, "PackageBases")
if not updated:
print("Updating PackageBases...")
# Produce pkgbase.gz
with gzip.open(pkgbasefile, "w") as f: with gzip.open(pkgbasefile, "w") as f:
f.write(bytes(pkgbaselist_header + "\n", "UTF-8")) f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
cur = conn.execute("SELECT Name FROM PackageBases " + cur = conn.execute("SELECT Name FROM PackageBases " +
"WHERE PackagerUID IS NOT NULL") "WHERE PackagerUID IS NOT NULL")
f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
update_state(pkgbases_state, update_time)
updated, update_time = should_update(users_state, "Users")
if not updated:
print("Updating Users...")
# Produce users.gz
with gzip.open(userfile, "w") as f: with gzip.open(userfile, "w") as f:
f.write(bytes(userlist_header + "\n", "UTF-8")) f.write(bytes(userlist_header + "\n", "UTF-8"))
cur = conn.execute("SELECT UserName FROM Users") cur = conn.execute("SELECT UserName FROM Users")
f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
update_state(users_state, update_time)
conn.close() conn.close()

View file

@ -103,6 +103,7 @@ server = https://mirrors.kernel.org/archlinux/%s/os/x86_64
[mkpkglists] [mkpkglists]
packagesfile = /srv/http/aurweb/web/html/packages.gz packagesfile = /srv/http/aurweb/web/html/packages.gz
packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz
packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
userfile = /srv/http/aurweb/web/html/users.gz userfile = /srv/http/aurweb/web/html/users.gz

View file

@ -37,6 +37,7 @@ enable-maintenance = 0
maintenance-exceptions = 127.0.0.1 maintenance-exceptions = 127.0.0.1
commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s
localedir = $TOPLEVEL/web/locale/ localedir = $TOPLEVEL/web/locale/
snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz
[notifications] [notifications]
notify-cmd = $NOTIFY notify-cmd = $NOTIFY
@ -68,6 +69,7 @@ server = file://$(pwd)/remote/
[mkpkglists] [mkpkglists]
packagesfile = packages.gz packagesfile = packages.gz
packagesmetafile = packages-meta-v1.json.gz packagesmetafile = packages-meta-v1.json.gz
packagesmetaextfile = packages-meta-ext-v1.json.gz
pkgbasefile = pkgbase.gz pkgbasefile = pkgbase.gz
userfile = users.gz userfile = users.gz
EOF EOF

View file

@ -189,7 +189,8 @@ if (!empty($tokens[1]) && '/' . $tokens[1] == get_pkg_route()) {
readfile("./$path"); readfile("./$path");
break; break;
case "/packages.gz": case "/packages.gz":
case "/packages-teapot.json.gz": case "/packages-meta-v1.json.gz":
case "/packages-meta-ext-v1.json.gz":
case "/pkgbase.gz": case "/pkgbase.gz":
case "/users.gz": case "/users.gz":
header("Content-Type: text/plain"); header("Content-Type: text/plain");