Mirror of https://gitlab.archlinux.org/archlinux/aurweb.git (synced 2025-02-03 10:43:03 +01:00)
fix(mkpkglists): improve package meta archive

The SQL logic in this file for package metadata now exactly reflects
RPC's search logic, without searching for specific packages.

The following command line argument is available:

    --extended | Include License, Keywords, Groups, relations and dependencies.

When --extended is passed, the script will create a
packages-meta-ext-v1.json.gz, configured via packagesmetaextfile.

Archive JSON is in the following format: line-separated package
objects enclosed in a list: [ {...}, {...}, {...} ]

Signed-off-by: Kevin Morris <kevr@0cost.org>

parent 69773a5b58
commit 51fb24ab73
6 changed files with 258 additions and 26 deletions
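
The meta archives described in the commit message are plain gzipped JSON lists, so they can be consumed with the standard library alone. A minimal sketch, assuming a locally downloaded packages-meta-v1.json.gz (illustration only, not part of this commit):

    import gzip
    import json

    # Hypothetical local copy of the archive this script produces.
    with gzip.open("packages-meta-v1.json.gz", "rt", encoding="utf-8") as f:
        packages = json.load(f)  # the whole archive is one JSON list

    # Each entry mirrors a type=search RPC result.
    for pkg in packages[:5]:
        print(pkg["Name"], pkg.get("Version"), pkg.get("Maintainer"))
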
@@ -12,7 +12,7 @@ before_script:
      python-pygit2 python-srcinfo python-bleach python-markdown
      python-sqlalchemy python-alembic python-pytest python-werkzeug
      python-pytest-tap python-fastapi hypercorn nginx python-authlib
      python-itsdangerous python-httpx
      python-itsdangerous python-httpx python-orjson

test:
  script:

INSTALL
@@ -49,7 +49,8 @@ read the instructions below.

  # pacman -S python-mysql-connector python-pygit2 python-srcinfo python-sqlalchemy \
              python-bleach python-markdown python-alembic python-jinja \
              python-itsdangerous python-authlib python-httpx hypercorn
              python-itsdangerous python-authlib python-httpx hypercorn \
              python-orjson
  # python3 setup.py install

5) Create a new MySQL database and a user and import the aurweb SQL schema:
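
A quick way to confirm that the newly listed python-orjson dependency is importable after installation (a sketch, not part of the commit):

    import orjson

    # orjson.dumps() returns bytes, unlike the standard json module.
    print(orjson.dumps({"ok": True}))  # b'{"ok":true}'
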
@@ -1,16 +1,192 @@
#!/usr/bin/env python3
"""
Produces package, package base and user archives for the AUR
database.

Archives:

    packages.gz               | A line-separated list of package names
    packages-meta-v1.json     | A type=search RPC-formatted JSON dataset
    packages-meta-ext-v1.json | An --extended archive
    pkgbase.gz                | A line-separated list of package base names
    users.gz                  | A line-separated list of user names

This script takes an optional argument: --extended. Based
on the following, right-hand side fields are added to each item.

    --extended | License, Keywords, Groups, relations and dependencies

"""

import datetime
import gzip
import json
import os
import sys

from collections import defaultdict
from decimal import Decimal
from typing import Tuple

import orjson

import aurweb.config
import aurweb.db
def state_path(archive: str) -> str:
    # A hard-coded /tmp state directory.
    # TODO: Use Redis cache to store this state after we merge
    # FastAPI into master and removed PHP from the tree.
    return os.path.join("/tmp", os.path.basename(archive) + ".state")


packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
packages_state = state_path(packagesfile)

pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
pkgbases_state = state_path(pkgbasefile)

userfile = aurweb.config.get('mkpkglists', 'userfile')
users_state = state_path(userfile)


def should_update(state: str, tablename: str) -> Tuple[bool, int]:
    if aurweb.config.get("database", "backend") != "mysql":
        return (False, 0)

    db_name = aurweb.config.get("database", "name")
    conn = aurweb.db.Connection()
    cur = conn.execute("SELECT auto_increment FROM information_schema.tables "
                       "WHERE table_schema = ? AND table_name = ?",
                       (db_name, tablename,))
    update_time = cur.fetchone()[0]

    saved_update_time = 0
    if os.path.exists(state):
        with open(state) as f:
            saved_update_time = int(f.read().strip())

    return (saved_update_time == update_time, update_time)


def update_state(state: str, update_time: int) -> None:
    with open(state, "w") as f:
        f.write(str(update_time))
TYPE_MAP = {
    "depends": "Depends",
    "makedepends": "MakeDepends",
    "checkdepends": "CheckDepends",
    "optdepends": "OptDepends",
    "conflicts": "Conflicts",
    "provides": "Provides",
    "replaces": "Replaces",
}


def get_extended_dict(query: str):
    """
    Produce data in the following form via a single bulk SQL query:

        {
            <integer_package_id>: {
                "Depends": [...],
                "Conflicts": [...],
                "License": [...]
            }
        }

    The caller can then use this data to populate a dataset of packages.

        output = produce_base_output_data()
        data = get_extended_dict(query)
        for i in range(len(output)):
            package_id = output[i].get("ID")
            output[i].update(data.get(package_id))
    """

    conn = aurweb.db.Connection()

    cursor = conn.execute(query)

    data = defaultdict(lambda: defaultdict(list))

    for result in cursor.fetchall():

        pkgid = result[0]
        key = TYPE_MAP.get(result[1])
        output = result[2]
        if result[3]:
            output += result[3]

        # In all cases, we have at least an empty License list.
        if "License" not in data[pkgid]:
            data[pkgid]["License"] = []

        # In all cases, we have at least an empty Keywords list.
        if "Keywords" not in data[pkgid]:
            data[pkgid]["Keywords"] = []

        data[pkgid][key].append(output)

    conn.close()
    return data
def get_extended_fields():
    # Returns: [ID, Type, Name, Cond]
    query = """
    SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
           PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
    FROM PackageDepends
    LEFT JOIN DependencyTypes
    ON DependencyTypes.ID = PackageDepends.DepTypeID
    UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
                 PackageRelations.RelName AS Name,
                 PackageRelations.RelCondition AS Cond
    FROM PackageRelations
    LEFT JOIN RelationTypes
    ON RelationTypes.ID = PackageRelations.RelTypeID
    UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
                 Groups.Name, '' AS Cond
    FROM Groups
    INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
                 Licenses.Name, '' as Cond
    FROM Licenses
    INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
                 PackageKeywords.Keyword AS Name, '' as Cond
    FROM PackageKeywords
    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
    """
    return get_extended_dict(query)


EXTENDED_FIELD_HANDLERS = {
    "--extended": get_extended_fields
}


def is_decimal(column):
    """ Check if an SQL column is of decimal.Decimal type. """
    if isinstance(column, Decimal):
        return float(column)
    return column


def write_archive(archive: str, output: list):
    with gzip.open(archive, "wb") as f:
        f.write(b"[\n")
        for i, item in enumerate(output):
            f.write(orjson.dumps(item))
            if i < len(output) - 1:
                f.write(b",")
            f.write(b"\n")
        f.write(b"]")


def main():
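
The write_archive() helper above emits one package object per line inside a single JSON list, so the archive can be parsed all at once or streamed line by line. A minimal line-by-line reader, assuming a local packages-meta-v1.json.gz (illustration only, not part of the commit):

    import gzip
    import json

    with gzip.open("packages-meta-v1.json.gz", "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip().rstrip(",")
            if line in ("[", "]", ""):
                continue  # skip the surrounding list brackets
            pkg = json.loads(line)  # one package object per line
            print(pkg["Name"])
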
@@ -21,32 +197,83 @@ def main():
    pkgbaselist_header = "# AUR package base list, generated on " + datestr
    userlist_header = "# AUR user name list, generated on " + datestr

    with gzip.open(packagesfile, "w") as f:
        f.write(bytes(pkglist_header + "\n", "UTF-8"))
        cur = conn.execute("SELECT Packages.Name FROM Packages " +
                           "INNER JOIN PackageBases " +
                           "ON PackageBases.ID = Packages.PackageBaseID " +
    updated, update_time = should_update(packages_state, "Packages")
    if not updated:
        print("Updating Packages...")

        # Query columns; copied from RPC.
        columns = ("Packages.ID, Packages.Name, "
                   "PackageBases.ID AS PackageBaseID, "
                   "PackageBases.Name AS PackageBase, "
                   "Version, Description, URL, NumVotes, "
                   "Popularity, OutOfDateTS AS OutOfDate, "
                   "Users.UserName AS Maintainer, "
                   "SubmittedTS AS FirstSubmitted, "
                   "ModifiedTS AS LastModified")

        # Perform query.
        cur = conn.execute(f"SELECT {columns} FROM Packages "
                           "LEFT JOIN PackageBases "
                           "ON PackageBases.ID = Packages.PackageBaseID "
                           "LEFT JOIN Users "
                           "ON PackageBases.MaintainerUID = Users.ID "
                           "WHERE PackageBases.PackagerUID IS NOT NULL")
        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])

    with gzip.open(packagesmetafile, "wt") as f:
        cur = conn.execute("SELECT * FROM Packages")
        json.dump({
            "warning": "This is a experimental! It can be removed or modified without warning!",
            "columns": [d[0] for d in cur.description],
            "data": cur.fetchall()
        }, f)
        # Produce packages-meta-v1.json.gz
        output = list()
        snapshot_uri = aurweb.config.get("options", "snapshot_uri")
        for result in cur.fetchall():
            item = {
                column[0]: is_decimal(result[i])
                for i, column in enumerate(cur.description)
            }
            item["URLPath"] = snapshot_uri % item.get("Name")
            output.append(item)

        write_archive(packagesmetafile, output)

        # Produce packages-meta-ext-v1.json.gz
        if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
            f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
            data = f()

            default_ = {"Groups": [], "License": [], "Keywords": []}
            for i in range(len(output)):
                data_ = data.get(output[i].get("ID"), default_)
                output[i].update(data_)

            write_archive(packagesmetaextfile, output)

        # Produce packages.gz
        with gzip.open(packagesfile, "wb") as f:
            f.write(bytes(pkglist_header + "\n", "UTF-8"))
            f.writelines([
                bytes(x.get("Name") + "\n", "UTF-8")
                for x in output
            ])

        update_state(packages_state, update_time)

    updated, update_time = should_update(pkgbases_state, "PackageBases")
    if not updated:
        print("Updating PackageBases...")
        # Produce pkgbase.gz
        with gzip.open(pkgbasefile, "w") as f:
            f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
            cur = conn.execute("SELECT Name FROM PackageBases " +
                               "WHERE PackagerUID IS NOT NULL")
            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
        update_state(pkgbases_state, update_time)

    updated, update_time = should_update(users_state, "Users")
    if not updated:
        print("Updating Users...")
        # Produce users.gz
        with gzip.open(userfile, "w") as f:
            f.write(bytes(userlist_header + "\n", "UTF-8"))
            cur = conn.execute("SELECT UserName FROM Users")
            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
        update_state(users_state, update_time)

    conn.close()
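
The main() changes above wrap each archive in a should_update()/update_state() pair, regenerating it only when the saved counter differs from the table's current auto_increment value. The same pattern in isolation, with hypothetical names and a hard-coded counter standing in for the database value (illustration only):

    import os

    def needs_rebuild(state_file: str, current_counter: int) -> bool:
        saved = 0
        if os.path.exists(state_file):
            with open(state_file) as f:
                saved = int(f.read().strip())
        return saved != current_counter

    def save_counter(state_file: str, current_counter: int) -> None:
        with open(state_file, "w") as f:
            f.write(str(current_counter))

    # Pretend the Packages table's AUTO_INCREMENT is currently 1234.
    if needs_rebuild("/tmp/packages.gz.state", 1234):
        # ... regenerate the archive here ...
        save_counter("/tmp/packages.gz.state", 1234)
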
@@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64
[mkpkglists]
packagesfile = /srv/http/aurweb/web/html/packages.gz
packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz
packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
userfile = /srv/http/aurweb/web/html/users.gz
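
These paths are what the script's module-level aurweb.config.get('mkpkglists', ...) calls resolve to. Purely as an illustration of that mapping (aurweb ships its own config wrapper, which is not shown here), the same INI-style section can be read with the standard library, assuming a config file at a hypothetical path:

    import configparser

    # Interpolation is disabled because other sections use literal '%s' placeholders.
    config = configparser.ConfigParser(interpolation=None)
    config.read("/etc/aurweb/config")  # hypothetical location

    print(config.get("mkpkglists", "packagesmetaextfile"))
    # e.g. /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
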
@@ -31,6 +31,7 @@ enable-maintenance = 0
maintenance-exceptions = 127.0.0.1
commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s
localedir = $TOPLEVEL/web/locale/
snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz

[notifications]
notify-cmd = $NOTIFY

@@ -62,6 +63,7 @@ server = file://$(pwd)/remote/
[mkpkglists]
packagesfile = packages.gz
packagesmetafile = packages-meta-v1.json.gz
packagesmetaextfile = packages-meta-ext-v1.json.gz
pkgbasefile = pkgbase.gz
userfile = users.gz
EOF
@@ -189,7 +189,8 @@ if (!empty($tokens[1]) && '/' . $tokens[1] == get_pkg_route()) {
        readfile("./$path");
        break;
    case "/packages.gz":
    case "/packages-teapot.json.gz":
    case "/packages-meta-v1.json.gz":
    case "/packages-meta-ext-v1.json.gz":
    case "/pkgbase.gz":
    case "/users.gz":
        header("Content-Type: text/plain");