fix(mkpkglists): improve package meta archive

The SQL logic in this file for package metadata now exactly reflects
RPC's search logic, without searching for specific packages.

One command line argument is available:

    --extended | Include License, Keywords, Groups, relations and dependencies

When --extended is passed, the script creates packages-meta-ext-v1.json.gz,
configured via packagesmetaextfile.

Archive JSON is in the following format: line-separated package objects
enclosed in a list:

    [
        {...},
        {...},
        {...}
    ]

Signed-off-by: Kevin Morris <kevr@0cost.org>
parent f606140050
commit f3f662c696

4 changed files with 255 additions and 24 deletions
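As an aside (not part of the commit), the archive layout described in the
commit message (one package object per line inside a JSON list) can be
consumed either in one shot or line by line. A minimal sketch, assuming a
local copy of packages-meta-v1.json.gz and the column names used below:

    import gzip
    import json

    # The archive is valid JSON, so the whole file can be loaded directly
    # from the decompressed text stream.
    with gzip.open("packages-meta-v1.json.gz", "rt") as f:
        packages = json.load(f)
    print(len(packages), "packages")

    # Because every object occupies exactly one line between the "[" and
    # "]" lines, the file can also be scanned without loading it whole.
    with gzip.open("packages-meta-v1.json.gz", "rt") as f:
        for line in f:
            line = line.strip().rstrip(",")
            if line in ("[", "]", ""):
                continue
            pkg = json.loads(line)
            print(pkg["Name"], pkg.get("Version"))

The comma stripping mirrors write_archive() in the diff below, which emits
a comma after every object except the last.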
@@ -1,16 +1,192 @@
 #!/usr/bin/env python3
+"""
+Produces package, package base and user archives for the AUR
+database.
+
+Archives:
+
+    packages.gz | A line-separated list of package names
+    packages-meta-v1.json | A type=search RPC-formatted JSON dataset
+    packages-meta-ext-v1.json | An --extended archive
+    pkgbase.gz | A line-separated list of package base names
+    users.gz | A line-separated list of user names
+
+This script takes an optional argument: --extended. Based
+on the following, right-hand side fields are added to each item.
+
+    --extended | License, Keywords, Groups, relations and dependencies
+
+"""
 
 import datetime
 import gzip
-import json
+import os
+import sys
+
+from collections import defaultdict
+from decimal import Decimal
+from typing import Tuple
+
+import orjson
 
 import aurweb.config
 import aurweb.db
 
 
+def state_path(archive: str) -> str:
+    # A hard-coded /tmp state directory.
+    # TODO: Use Redis cache to store this state after we merge
+    # FastAPI into master and remove PHP from the tree.
+    return os.path.join("/tmp", os.path.basename(archive) + ".state")
+
+
 packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
 packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
+packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
+packages_state = state_path(packagesfile)
+
 pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
+pkgbases_state = state_path(pkgbasefile)
+
 userfile = aurweb.config.get('mkpkglists', 'userfile')
+users_state = state_path(userfile)
+
+
+def should_update(state: str, tablename: str) -> Tuple[bool, int]:
+    if aurweb.config.get("database", "backend") != "mysql":
+        return (False, 0)
+
+    db_name = aurweb.config.get("database", "name")
+    conn = aurweb.db.Connection()
+    cur = conn.execute("SELECT auto_increment FROM information_schema.tables "
+                       "WHERE table_schema = ? AND table_name = ?",
+                       (db_name, tablename,))
+    update_time = cur.fetchone()[0]
+
+    saved_update_time = 0
+    if os.path.exists(state):
+        with open(state) as f:
+            saved_update_time = int(f.read().strip())
+
+    return (saved_update_time == update_time, update_time)
+
+
+def update_state(state: str, update_time: int) -> None:
+    with open(state, "w") as f:
+        f.write(str(update_time))
+
+
+TYPE_MAP = {
+    "depends": "Depends",
+    "makedepends": "MakeDepends",
+    "checkdepends": "CheckDepends",
+    "optdepends": "OptDepends",
+    "conflicts": "Conflicts",
+    "provides": "Provides",
+    "replaces": "Replaces",
+}
+
+
+def get_extended_dict(query: str):
+    """
+    Produce data in the following form via a single bulk SQL query:
+
+    {
+        <integer_package_id>: {
+            "Depends": [...],
+            "Conflicts": [...],
+            "License": [...]
+        }
+    }
+
+    The caller can then use this data to populate a dataset of packages.
+
+        output = produce_base_output_data()
+        data = get_extended_dict(query)
+        for i in range(len(output)):
+            package_id = output[i].get("ID")
+            output[i].update(data.get(package_id))
+    """
+
+    conn = aurweb.db.Connection()
+
+    cursor = conn.execute(query)
+
+    data = defaultdict(lambda: defaultdict(list))
+
+    for result in cursor.fetchall():
+        pkgid = result[0]
+        key = TYPE_MAP.get(result[1], result[1])
+        output = result[2]
+        if result[3]:
+            output += result[3]
+
+        # In all cases, we have at least an empty License list.
+        if "License" not in data[pkgid]:
+            data[pkgid]["License"] = []
+
+        # In all cases, we have at least an empty Keywords list.
+        if "Keywords" not in data[pkgid]:
+            data[pkgid]["Keywords"] = []
+
+        data[pkgid][key].append(output)
+
+    conn.close()
+    return data
+
+
+def get_extended_fields():
+    # Returns: [ID, Type, Name, Cond]
+    query = """
+    SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
+           PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
+    FROM PackageDepends
+    LEFT JOIN DependencyTypes
+    ON DependencyTypes.ID = PackageDepends.DepTypeID
+    UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
+                 PackageRelations.RelName AS Name,
+                 PackageRelations.RelCondition AS Cond
+    FROM PackageRelations
+    LEFT JOIN RelationTypes
+    ON RelationTypes.ID = PackageRelations.RelTypeID
+    UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
+                 Groups.Name, '' AS Cond
+    FROM Groups
+    INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
+    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
+                 Licenses.Name, '' AS Cond
+    FROM Licenses
+    INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
+    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
+                 PackageKeywords.Keyword AS Name, '' AS Cond
+    FROM PackageKeywords
+    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
+    """
+    return get_extended_dict(query)
+
+
+EXTENDED_FIELD_HANDLERS = {
+    "--extended": get_extended_fields
+}
+
+
+def is_decimal(column):
+    """ Convert an SQL decimal.Decimal column to float; return other
+    column values unchanged. """
+    if isinstance(column, Decimal):
+        return float(column)
+    return column
+
+
+def write_archive(archive: str, output: list):
+    with gzip.open(archive, "wb") as f:
+        f.write(b"[\n")
+        for i, item in enumerate(output):
+            f.write(orjson.dumps(item))
+            if i < len(output) - 1:
+                f.write(b",")
+            f.write(b"\n")
+        f.write(b"]")
+
+
@@ -21,32 +197,83 @@ def main():
     pkgbaselist_header = "# AUR package base list, generated on " + datestr
     userlist_header = "# AUR user name list, generated on " + datestr
 
-    with gzip.open(packagesfile, "w") as f:
-        f.write(bytes(pkglist_header + "\n", "UTF-8"))
-        cur = conn.execute("SELECT Packages.Name FROM Packages " +
-                           "INNER JOIN PackageBases " +
-                           "ON PackageBases.ID = Packages.PackageBaseID " +
+    updated, update_time = should_update(packages_state, "Packages")
+    if not updated:
+        print("Updating Packages...")
+
+        # Query columns; copied from RPC.
+        columns = ("Packages.ID, Packages.Name, "
+                   "PackageBases.ID AS PackageBaseID, "
+                   "PackageBases.Name AS PackageBase, "
+                   "Version, Description, URL, NumVotes, "
+                   "Popularity, OutOfDateTS AS OutOfDate, "
+                   "Users.UserName AS Maintainer, "
+                   "SubmittedTS AS FirstSubmitted, "
+                   "ModifiedTS AS LastModified")
+
+        # Perform query.
+        cur = conn.execute(f"SELECT {columns} FROM Packages "
+                           "LEFT JOIN PackageBases "
+                           "ON PackageBases.ID = Packages.PackageBaseID "
+                           "LEFT JOIN Users "
+                           "ON PackageBases.MaintainerUID = Users.ID "
                            "WHERE PackageBases.PackagerUID IS NOT NULL")
-        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
-
-    with gzip.open(packagesmetafile, "wt") as f:
-        cur = conn.execute("SELECT * FROM Packages")
-        json.dump({
-            "warning": "This is a experimental! It can be removed or modified without warning!",
-            "columns": [d[0] for d in cur.description],
-            "data": cur.fetchall()
-        }, f)
+
+        # Produce packages-meta-v1.json.gz
+        output = list()
+        snapshot_uri = aurweb.config.get("options", "snapshot_uri")
+        for result in cur.fetchall():
+            item = {
+                column[0]: is_decimal(result[i])
+                for i, column in enumerate(cur.description)
+            }
+            item["URLPath"] = snapshot_uri % item.get("Name")
+            output.append(item)
+
+        write_archive(packagesmetafile, output)
+
+        # Produce packages-meta-ext-v1.json.gz
+        if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
+            f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
+            data = f()
+
+            default_ = {"Groups": [], "License": [], "Keywords": []}
+            for i in range(len(output)):
+                data_ = data.get(output[i].get("ID"), default_)
+                output[i].update(data_)
+
+            write_archive(packagesmetaextfile, output)
+
+        # Produce packages.gz
+        with gzip.open(packagesfile, "wb") as f:
+            f.write(bytes(pkglist_header + "\n", "UTF-8"))
+            f.writelines([
+                bytes(x.get("Name") + "\n", "UTF-8")
+                for x in output
+            ])
+
+        update_state(packages_state, update_time)
+
+    updated, update_time = should_update(pkgbases_state, "PackageBases")
+    if not updated:
+        print("Updating PackageBases...")
 
-    with gzip.open(pkgbasefile, "w") as f:
-        f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
-        cur = conn.execute("SELECT Name FROM PackageBases " +
-                           "WHERE PackagerUID IS NOT NULL")
-        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+        # Produce pkgbase.gz
+        with gzip.open(pkgbasefile, "w") as f:
+            f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
+            cur = conn.execute("SELECT Name FROM PackageBases " +
+                               "WHERE PackagerUID IS NOT NULL")
+            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+
+        update_state(pkgbases_state, update_time)
+
+    updated, update_time = should_update(users_state, "Users")
+    if not updated:
+        print("Updating Users...")
 
-    with gzip.open(userfile, "w") as f:
-        f.write(bytes(userlist_header + "\n", "UTF-8"))
-        cur = conn.execute("SELECT UserName FROM Users")
-        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+        # Produce users.gz
+        with gzip.open(userfile, "w") as f:
+            f.write(bytes(userlist_header + "\n", "UTF-8"))
+            cur = conn.execute("SELECT UserName FROM Users")
+            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+
+        update_state(users_state, update_time)
 
     conn.close()
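A side note on the merge step in main() above (illustrative only; the IDs
and values below are made up): get_extended_fields() returns a mapping
keyed by package ID, and dict.update() folds those lists into each base
item, with default_ standing in for packages that have no extended rows:

    # Shaped like get_extended_fields() output; values are invented.
    data = {
        1: {"Depends": ["glibc"], "License": ["GPL"], "Keywords": []},
    }

    # Shaped like the base packages-meta-v1 items built from the RPC columns.
    output = [
        {"ID": 1, "Name": "foo"},
        {"ID": 2, "Name": "bar"},  # no dependency/license/keyword rows
    ]

    default_ = {"Groups": [], "License": [], "Keywords": []}
    for i in range(len(output)):
        output[i].update(data.get(output[i].get("ID"), default_))

    # output[0] now carries its Depends/License lists; output[1] received
    # the empty defaults from default_.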
@@ -103,6 +103,7 @@ server = https://mirrors.kernel.org/archlinux/%s/os/x86_64
 
 [mkpkglists]
 packagesfile = /srv/http/aurweb/web/html/packages.gz
 packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz
+packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
 pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
 userfile = /srv/http/aurweb/web/html/users.gz
@@ -37,6 +37,7 @@ enable-maintenance = 0
 maintenance-exceptions = 127.0.0.1
 commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s
 localedir = $TOPLEVEL/web/locale/
+snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz
 
 [notifications]
 notify-cmd = $NOTIFY
@@ -68,6 +69,7 @@ server = file://$(pwd)/remote/
 [mkpkglists]
 packagesfile = packages.gz
 packagesmetafile = packages-meta-v1.json.gz
+packagesmetaextfile = packages-meta-ext-v1.json.gz
 pkgbasefile = pkgbase.gz
 userfile = users.gz
 EOF
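For reference (an illustration, not part of the diff): the snapshot_uri
template added to the test configuration above is the option main() reads
and interpolates with each package name to populate URLPath. A minimal
sketch, using a hypothetical package name:

    snapshot_uri = "/cgit/aur.git/snapshot/%s.tar.gz"

    # "example-pkg" stands in for item.get("Name") in main().
    url_path = snapshot_uri % "example-pkg"
    assert url_path == "/cgit/aur.git/snapshot/example-pkg.tar.gz"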
@@ -189,7 +189,8 @@ if (!empty($tokens[1]) && '/' . $tokens[1] == get_pkg_route()) {
         readfile("./$path");
         break;
     case "/packages.gz":
     case "/packages-meta-v1.json.gz":
+    case "/packages-meta-ext-v1.json.gz":
     case "/pkgbase.gz":
     case "/users.gz":
         header("Content-Type: text/plain");