From 4f09e939ae1d605f6568a180d8ab86033e847a0f Mon Sep 17 00:00:00 2001 From: Kevin Morris Date: Sun, 6 Jun 2021 21:34:42 -0700 Subject: [PATCH] bugfix: gendummydata.py was producing invalid usernames As per our regex and policies, usernames should consist of ASCII alphanumeric characters and possibly (-, _ or .). gendummydata.py was creating Unicode versions of some usernames and adding them into the DB. With our newfound collations, this becomes a problem, because the collations treat those Unicode variants as equal to their ASCII counterparts. Such usernames should never have been generated, so gendummydata.py has been patched to normalize all of its usernames and package names. Signed-off-by: Kevin Morris --- schema/gendummydata.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/schema/gendummydata.py b/schema/gendummydata.py index c7b3a06d..35805d6c 100755 --- a/schema/gendummydata.py +++ b/schema/gendummydata.py @@ -98,11 +98,19 @@ if MAX_USERS > len(contents): MAX_USERS = len(contents) if MAX_PKGS > len(contents): MAX_PKGS = len(contents) -if len(contents) - MAX_USERS > MAX_PKGS: - need_dupes = 0 -else: + +need_dupes = 0 +if not len(contents) - MAX_USERS > MAX_PKGS: need_dupes = 1 + +def normalize(unicode_data): + """ We only accept ascii for usernames. Also use this to normalize + package names; our database utf8mb4 collations compare with Unicode + Equivalence. 
""" + return unicode_data.encode('ascii', 'ignore').decode('ascii') + + # select random usernames # log.debug("Generating random user names...") @@ -110,12 +118,13 @@ user_id = USER_ID while len(seen_users) < MAX_USERS: user = random.randrange(0, len(contents)) word = contents[user].replace("'", "").replace(".", "").replace(" ", "_") - word = word.strip().lower() + word = normalize(word.strip().lower()) if word not in seen_users: seen_users[word] = user_id user_id += 1 user_keys = list(seen_users.keys()) + # select random package names # log.debug("Generating random package names...") @@ -123,7 +132,7 @@ num_pkgs = PKG_ID while len(seen_pkgs) < MAX_PKGS: pkg = random.randrange(0, len(contents)) word = contents[pkg].replace("'", "").replace(".", "").replace(" ", "_") - word = word.strip().lower() + word = normalize(word.strip().lower()) if not need_dupes: if word not in seen_pkgs and word not in seen_users: seen_pkgs[word] = num_pkgs @@ -285,10 +294,10 @@ for p in seen_pkgs_keys: for i in range(num_sources): src_file = user_keys[random.randrange(0, len(user_keys))] src = "%s%s.%s/%s/%s-%s.tar.gz" % ( - RANDOM_URL[random.randrange(0, len(RANDOM_URL))], - p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))], - RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))], - src_file, genVersion()) + RANDOM_URL[random.randrange(0, len(RANDOM_URL))], + p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))], + RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))], + src_file, genVersion()) s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n" s = s % (seen_pkgs[p], src) out.write(s)