aurweb/aurweb/scripts/mkpkglists.py
Kevin Morris abbecf5194
change(mkpkglists): remove header comments
These comments change every time mkpkglists is run, which
would invalidate the ETag headers served by the gzip
host. This commit removes those changing headers.

Signed-off-by: Kevin Morris <kevr@0cost.org>
2021-11-10 07:39:17 -08:00

222 lines
6.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Produces package, package base and user archives for the AUR
database.
Archives:
packages.gz | A line-separated list of package names
packages-meta-v1.json | A type=search RPC-formatted JSON dataset
packages-meta-ext-v1.json | An --extended archive
pkgbase.gz | A line-separated list of package base names
users.gz | A line-separated list of user names
This script takes an optional argument: --extended. When it is
given, the fields listed on the right-hand side below are added to each item.
--extended | License, Keywords, Groups, relations and dependencies
"""
import gzip
import sys
from collections import defaultdict
from decimal import Decimal
import orjson
import aurweb.config
import aurweb.db
# Destination paths of the generated archives, looked up under 'mkpkglists'
# in the aurweb configuration. main() opens each of them for writing.
packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
userfile = aurweb.config.get('mkpkglists', 'userfile')
# Maps the lowercase type names selected by the extended-fields query
# (DependencyTypes.Name / RelationTypes.Name) to the capitalized field
# names used as keys in the extended output. Every key is simply the
# lowercase form of its field name.
TYPE_MAP = {
    field_name.lower(): field_name
    for field_name in (
        "Depends",
        "MakeDepends",
        "CheckDepends",
        "OptDepends",
        "Conflicts",
        "Provides",
        "Replaces",
    )
}
def get_extended_dict(query: str):
    """
    Run a single bulk SQL query and group its rows by package ID.

    Each row must provide, in order: the package ID, a type name, a
    name and an optional condition. A non-empty condition is appended
    to the name (string concatenation) to form the stored value.

    Produces data in the form:

        {
            <integer_package_id>: {
                "Depends": [...],
                "Conflicts": [...],
                "License": [...],
                "Keywords": [...]
            }
        }

    Type names found in TYPE_MAP are translated to their field name;
    any other type name (e.g. the literal 'Groups', 'License' and
    'Keywords' types) is used as the field name unchanged.

    The caller can then use this data to populate a dataset of packages:

        output = produce_base_output_data()
        data = get_extended_dict(query)
        for item in output:
            item.update(data.get(item.get("ID"), {}))

    :param query: SQL text selecting the four columns described above
    :return: defaultdict mapping package ID to a defaultdict(list) of
             field name -> list of values
    """
    conn = aurweb.db.Connection()
    data = defaultdict(lambda: defaultdict(list))

    # Release the connection even when the query or row handling fails.
    try:
        cursor = conn.execute(query)

        for result in cursor.fetchall():
            pkgid = result[0]

            # Fall back to the raw type name: types that are not listed in
            # TYPE_MAP would otherwise all be collected under a None key,
            # which is not a usable field name for the archive.
            key = TYPE_MAP.get(result[1], result[1])

            output = result[2]
            if result[3]:
                output += result[3]

            fields = data[pkgid]

            # In all cases, we have at least an empty License list.
            fields.setdefault("License", [])

            # In all cases, we have at least an empty Keywords list.
            fields.setdefault("Keywords", [])

            fields[key].append(output)
    finally:
        conn.close()

    return data
def get_extended_fields():
    """
    Fetch the --extended fields for all packages in one query.

    The query unions dependencies, relations, groups, licenses and
    keywords; the combined rows are grouped by get_extended_dict, whose
    result is returned as-is.
    """
    # Every SELECT of the union returns the columns: [ID, Type, Name, Cond]
    extended_sql = """
SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
FROM PackageDepends
LEFT JOIN DependencyTypes
ON DependencyTypes.ID = PackageDepends.DepTypeID
UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
PackageRelations.RelName AS Name,
PackageRelations.RelCondition AS Cond
FROM PackageRelations
LEFT JOIN RelationTypes
ON RelationTypes.ID = PackageRelations.RelTypeID
UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
Groups.Name, '' AS Cond
FROM Groups
INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
Licenses.Name, '' as Cond
FROM Licenses
INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
PackageKeywords.Keyword AS Name, '' as Cond
FROM PackageKeywords
INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
"""
    return get_extended_dict(extended_sql)
# Maps each supported command-line flag to the function that fetches the
# additional per-package data for it; main() looks up sys.argv[1] here.
EXTENDED_FIELD_HANDLERS = {
    "--extended": get_extended_fields
}
def is_decimal(column):
    """
    Convert a decimal.Decimal column value to float.

    Despite its name this is a converter, not a predicate: Decimal
    instances come back as float, every other value is returned
    unchanged.
    """
    return float(column) if isinstance(column, Decimal) else column
def write_archive(archive: str, output: list):
    """
    Write `output` to the gzip file `archive` as a JSON array.

    The array is laid out with one orjson-serialized item per line:
    an opening "[" line, each item followed by "," (except the last
    one), and a closing "]" without a trailing newline.

    :param archive: path of the gzip file to (over)write
    :param output: list of items serializable by orjson.dumps
    """
    final_index = len(output) - 1

    with gzip.open(archive, "wb") as stream:
        stream.write(b"[\n")
        for index, entry in enumerate(output):
            # Only the last item is not followed by a comma.
            terminator = b",\n" if index < final_index else b"\n"
            stream.write(orjson.dumps(entry) + terminator)
        stream.write(b"]")
def main():
    """
    Produce the package, package base and user archives.

    Always written: packagesmetafile, packagesfile, pkgbasefile and
    userfile. When the first command-line argument is a key of
    EXTENDED_FIELD_HANDLERS, every item is additionally updated with the
    data returned by that handler and the result is written to
    packagesmetaextfile.

    Only packages whose package base has a non-NULL PackagerUID are
    included in the package archives.
    """
    conn = aurweb.db.Connection()

    # Close the connection even if a query or an archive write fails.
    try:
        # Query columns; copied from RPC.
        columns = ("Packages.ID, Packages.Name, "
                   "PackageBases.ID AS PackageBaseID, "
                   "PackageBases.Name AS PackageBase, "
                   "Version, Description, URL, NumVotes, "
                   "Popularity, OutOfDateTS AS OutOfDate, "
                   "Users.UserName AS Maintainer, "
                   "SubmittedTS AS FirstSubmitted, "
                   "ModifiedTS AS LastModified")

        # Perform query.
        cur = conn.execute(f"SELECT {columns} FROM Packages "
                           "LEFT JOIN PackageBases "
                           "ON PackageBases.ID = Packages.PackageBaseID "
                           "LEFT JOIN Users "
                           "ON PackageBases.MaintainerUID = Users.ID "
                           "WHERE PackageBases.PackagerUID IS NOT NULL")

        # Produce packages-meta-v1.json.gz
        output = []
        snapshot_uri = aurweb.config.get("options", "snapshot_uri")
        for result in cur.fetchall():
            # Key every value by its column name; Decimal values are
            # converted to float by is_decimal.
            item = {
                column[0]: is_decimal(result[i])
                for i, column in enumerate(cur.description)
            }
            item["URLPath"] = snapshot_uri % item.get("Name")
            output.append(item)

        write_archive(packagesmetafile, output)

        # Produce packages-meta-ext-v1.json.gz
        if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
            handler = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
            data = handler()

            # Fields given to packages the handler returned no rows for.
            default_ = {"Groups": [], "License": [], "Keywords": []}
            for item in output:
                item.update(data.get(item.get("ID"), default_))

            write_archive(packagesmetaextfile, output)

        # Produce packages.gz
        with gzip.open(packagesfile, "wb") as f:
            f.writelines([
                bytes(x.get("Name") + "\n", "UTF-8")
                for x in output
            ])

        # Produce pkgbase.gz
        with gzip.open(pkgbasefile, "w") as f:
            cur = conn.execute("SELECT Name FROM PackageBases " +
                               "WHERE PackagerUID IS NOT NULL")
            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])

        # Produce users.gz
        with gzip.open(userfile, "w") as f:
            cur = conn.execute("SELECT UserName FROM Users")
            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
    finally:
        conn.close()
# Generate the archives only when this file is executed as a script.
if __name__ == '__main__':
    main()