From 4f09e939ae1d605f6568a180d8ab86033e847a0f Mon Sep 17 00:00:00 2001
From: Kevin Morris <kevr@0cost.org>
Date: Sun, 6 Jun 2021 21:34:42 -0700
Subject: [PATCH] bugfix: gendummydata.py was producing invalid usernames

As per our regex and policies, usernames should consist of
ASCII alphanumeric characters and possibly '-', '_' or '.'.

gendummydata.py was creating Unicode versions of some
usernames and adding them into the DB. With our newfound
collations, this becomes a problem, as the collations treat
Unicode-equivalent usernames as the same.

This should never have been the case, so gendummydata.py
has been patched to normalize all of its usernames and
package names by dropping any non-ASCII characters.

Signed-off-by: Kevin Morris <kevr@0cost.org>
---
 schema/gendummydata.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/schema/gendummydata.py b/schema/gendummydata.py
index c7b3a06d..35805d6c 100755
--- a/schema/gendummydata.py
+++ b/schema/gendummydata.py
@@ -98,11 +98,19 @@ if MAX_USERS > len(contents):
     MAX_USERS = len(contents)
 if MAX_PKGS > len(contents):
     MAX_PKGS = len(contents)
-if len(contents) - MAX_USERS > MAX_PKGS:
-    need_dupes = 0
-else:
+
+need_dupes = 0
+if not len(contents) - MAX_USERS > MAX_PKGS:
     need_dupes = 1
 
+
+def normalize(unicode_data):
+    """ We only accept ascii for usernames. Also use this to normalize
+    package names; our database utf8mb4 collations compare with Unicode
+    Equivalence. """
+    return unicode_data.encode('ascii', 'ignore').decode('ascii')
+
+
 # select random usernames
 #
 log.debug("Generating random user names...")
@@ -110,12 +118,13 @@ user_id = USER_ID
 while len(seen_users) < MAX_USERS:
     user = random.randrange(0, len(contents))
     word = contents[user].replace("'", "").replace(".", "").replace(" ", "_")
-    word = word.strip().lower()
+    word = normalize(word.strip().lower())
     if word not in seen_users:
         seen_users[word] = user_id
         user_id += 1
 user_keys = list(seen_users.keys())
 
+
 # select random package names
 #
 log.debug("Generating random package names...")
@@ -123,7 +132,7 @@ num_pkgs = PKG_ID
 while len(seen_pkgs) < MAX_PKGS:
     pkg = random.randrange(0, len(contents))
     word = contents[pkg].replace("'", "").replace(".", "").replace(" ", "_")
-    word = word.strip().lower()
+    word = normalize(word.strip().lower())
     if not need_dupes:
         if word not in seen_pkgs and word not in seen_users:
             seen_pkgs[word] = num_pkgs
@@ -285,10 +294,10 @@ for p in seen_pkgs_keys:
     for i in range(num_sources):
         src_file = user_keys[random.randrange(0, len(user_keys))]
         src = "%s%s.%s/%s/%s-%s.tar.gz" % (
-                RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
-                p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
-                RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
-                src_file, genVersion())
+            RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
+            p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
+            RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
+            src_file, genVersion())
         s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
         s = s % (seen_pkgs[p], src)
         out.write(s)