bugfix: gendummydata.py was producing invalid usernames

As per our regex and policies, usernames should consist of ASCII alphanumeric characters and possibly (-, _ or .). gendummydata.py was creating Unicode versions of some usernames and adding them to the DB. With our newfound collations this becomes a problem, as the collations compare those names as equal to their ASCII equivalents. This should never have been the case, so gendummydata.py has been patched to normalize all of its usernames and package names to ASCII. Signed-off-by: Kevin Morris <kevr@0cost.org>
2025-02-03 10:43:03 +01:00 · 2021-06-06 21:34:42 -07:00 · 2021-06-06 21:34:42 -07:00 · 4f09e939ae
commit 4f09e939ae
parent 83887b97df
1 changed file with 18 additions and 9 deletions
--- a/schema/gendummydata.py
+++ b/schema/gendummydata.py
@ -98,11 +98,19 @@ if MAX_USERS > len(contents):
    MAX_USERS = len(contents)
 if MAX_PKGS > len(contents):
    MAX_PKGS = len(contents)
-if len(contents) - MAX_USERS > MAX_PKGS:
-    need_dupes = 0
-else:
+
+need_dupes = 0
+if not len(contents) - MAX_USERS > MAX_PKGS:
    need_dupes = 1

+
+def normalize(unicode_data):
+    """ We only accept ascii for usernames. Also use this to normalize
+    package names; our database utf8mb4 collations compare with Unicode
+    Equivalence. """
+    return unicode_data.encode('ascii', 'ignore').decode('ascii')
+
+
 # select random usernames
 #
 log.debug("Generating random user names...")
@ -110,12 +118,13 @@ user_id = USER_ID
 while len(seen_users) < MAX_USERS:
    user = random.randrange(0, len(contents))
    word = contents[user].replace("'", "").replace(".", "").replace(" ", "_")
-    word = word.strip().lower()
+    word = normalize(word.strip().lower())
    if word not in seen_users:
        seen_users[word] = user_id
        user_id += 1
 user_keys = list(seen_users.keys())

+
 # select random package names
 #
 log.debug("Generating random package names...")
@ -123,7 +132,7 @@ num_pkgs = PKG_ID
 while len(seen_pkgs) < MAX_PKGS:
    pkg = random.randrange(0, len(contents))
    word = contents[pkg].replace("'", "").replace(".", "").replace(" ", "_")
-    word = word.strip().lower()
+    word = normalize(word.strip().lower())
    if not need_dupes:
        if word not in seen_pkgs and word not in seen_users:
            seen_pkgs[word] = num_pkgs
@ -285,10 +294,10 @@ for p in seen_pkgs_keys:
    for i in range(num_sources):
        src_file = user_keys[random.randrange(0, len(user_keys))]
        src = "%s%s.%s/%s/%s-%s.tar.gz" % (
-                RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
-                p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
-                RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
-                src_file, genVersion())
+            RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
+            p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
+            RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
+            src_file, genVersion())
        s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
        s = s % (seen_pkgs[p], src)
        out.write(s)