mirror of
https://gitlab.archlinux.org/archlinux/aurweb.git
synced 2025-02-03 10:43:03 +01:00
bugfix: gendummydata.py was producing invalid usernames
As per our regex and policies, usernames should consist of ASCII alphanumeric characters and possibly `-`, `_` or `.`. gendummydata.py was creating Unicode variants of some usernames and adding them to the DB. With our newly adopted collations, this becomes a problem, because the collation treats such names as identical to their ASCII counterparts. This should never have been the case, so gendummydata.py has been patched to normalize all of its usernames and package names. Signed-off-by: Kevin Morris <kevr@0cost.org>
This commit is contained in:
parent
83887b97df
commit
4f09e939ae
1 changed files with 18 additions and 9 deletions
|
@ -98,11 +98,19 @@ if MAX_USERS > len(contents):
|
|||
MAX_USERS = len(contents)
|
||||
if MAX_PKGS > len(contents):
|
||||
MAX_PKGS = len(contents)
|
||||
if len(contents) - MAX_USERS > MAX_PKGS:
|
||||
need_dupes = 0
|
||||
else:
|
||||
|
||||
need_dupes = 0
|
||||
if not len(contents) - MAX_USERS > MAX_PKGS:
|
||||
need_dupes = 1
|
||||
|
||||
|
||||
def normalize(unicode_data: str) -> str:
    """Return *unicode_data* with every non-ASCII character removed.

    Only ASCII is accepted for usernames. The same treatment is applied
    to package names, because the database's utf8mb4 collations compare
    with Unicode equivalence.
    """
    # The 'ignore' error handler silently drops any character that cannot
    # be encoded as ASCII; decoding the resulting bytes gives back a str.
    return unicode_data.encode('ascii', 'ignore').decode('ascii')
|
||||
|
||||
|
||||
# select random usernames
|
||||
#
|
||||
log.debug("Generating random user names...")
|
||||
|
@ -110,12 +118,13 @@ user_id = USER_ID
|
|||
while len(seen_users) < MAX_USERS:
|
||||
user = random.randrange(0, len(contents))
|
||||
word = contents[user].replace("'", "").replace(".", "").replace(" ", "_")
|
||||
word = word.strip().lower()
|
||||
word = normalize(word.strip().lower())
|
||||
if word not in seen_users:
|
||||
seen_users[word] = user_id
|
||||
user_id += 1
|
||||
user_keys = list(seen_users.keys())
|
||||
|
||||
|
||||
# select random package names
|
||||
#
|
||||
log.debug("Generating random package names...")
|
||||
|
@ -123,7 +132,7 @@ num_pkgs = PKG_ID
|
|||
while len(seen_pkgs) < MAX_PKGS:
|
||||
pkg = random.randrange(0, len(contents))
|
||||
word = contents[pkg].replace("'", "").replace(".", "").replace(" ", "_")
|
||||
word = word.strip().lower()
|
||||
word = normalize(word.strip().lower())
|
||||
if not need_dupes:
|
||||
if word not in seen_pkgs and word not in seen_users:
|
||||
seen_pkgs[word] = num_pkgs
|
||||
|
@ -285,10 +294,10 @@ for p in seen_pkgs_keys:
|
|||
for i in range(num_sources):
|
||||
src_file = user_keys[random.randrange(0, len(user_keys))]
|
||||
src = "%s%s.%s/%s/%s-%s.tar.gz" % (
|
||||
RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
|
||||
p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
|
||||
RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
|
||||
src_file, genVersion())
|
||||
RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
|
||||
p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
|
||||
RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
|
||||
src_file, genVersion())
|
||||
s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
|
||||
s = s % (seen_pkgs[p], src)
|
||||
out.write(s)
|
||||
|
|
Loading…
Add table
Reference in a new issue