diff --git a/schema/gendummydata.py b/schema/gendummydata.py index c7b3a06d..35805d6c 100755 --- a/schema/gendummydata.py +++ b/schema/gendummydata.py @@ -98,11 +98,19 @@ if MAX_USERS > len(contents): MAX_USERS = len(contents) if MAX_PKGS > len(contents): MAX_PKGS = len(contents) -if len(contents) - MAX_USERS > MAX_PKGS: - need_dupes = 0 -else: + +need_dupes = 0 +if not len(contents) - MAX_USERS > MAX_PKGS: need_dupes = 1 + +def normalize(unicode_data): + """ We only accept ascii for usernames. Also use this to normalize + package names; our database utf8mb4 collations compare with Unicode + Equivalence. """ + return unicode_data.encode('ascii', 'ignore').decode('ascii') + + # select random usernames # log.debug("Generating random user names...") @@ -110,12 +118,13 @@ user_id = USER_ID while len(seen_users) < MAX_USERS: user = random.randrange(0, len(contents)) word = contents[user].replace("'", "").replace(".", "").replace(" ", "_") - word = word.strip().lower() + word = normalize(word.strip().lower()) if word not in seen_users: seen_users[word] = user_id user_id += 1 user_keys = list(seen_users.keys()) + # select random package names # log.debug("Generating random package names...") @@ -123,7 +132,7 @@ num_pkgs = PKG_ID while len(seen_pkgs) < MAX_PKGS: pkg = random.randrange(0, len(contents)) word = contents[pkg].replace("'", "").replace(".", "").replace(" ", "_") - word = word.strip().lower() + word = normalize(word.strip().lower()) if not need_dupes: if word not in seen_pkgs and word not in seen_users: seen_pkgs[word] = num_pkgs @@ -285,10 +294,10 @@ for p in seen_pkgs_keys: for i in range(num_sources): src_file = user_keys[random.randrange(0, len(user_keys))] src = "%s%s.%s/%s/%s-%s.tar.gz" % ( - RANDOM_URL[random.randrange(0, len(RANDOM_URL))], - p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))], - RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))], - src_file, genVersion()) + RANDOM_URL[random.randrange(0, len(RANDOM_URL))], + p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))], + RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))], + src_file, genVersion()) s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n" s = s % (seen_pkgs[p], src) out.write(s)