Scrubbing User Data
We want to ensure that the data we analyze is scrubbed so that no traceable information can be linked back to individual respondents.
Import Dependencies
# import dependencies
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as sci
import pandas as pd
from pathlib import Path
import uuid
from IPython.display import display, HTML
def disp(df):
    display(HTML(df.to_html()))
Load our raw data (a CSV file) into a data frame. Raw data should be located in ./raw-data/.
file_name = input("What is the name of the csv file?")
file_path = "./raw-data/" + file_name
df = pd.read_csv(file_path)
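If the typed file name is wrong, read_csv fails with a bare FileNotFoundError. A small guard makes the failure mode explicit; this is a sketch, and load_survey is a hypothetical helper, not part of the notebook:

```python
from pathlib import Path

import pandas as pd


def load_survey(file_name: str, raw_dir: str = "./raw-data") -> pd.DataFrame:
    """Load a survey CSV, failing with a clear message if the path is wrong."""
    path = Path(raw_dir) / file_name
    if not path.is_file():
        raise FileNotFoundError(f"no such survey file: {path}")
    return pd.read_csv(path)
```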
Prune duplicate emails from the dataset. This is an easy way to detect whether someone has submitted multiple responses.
col_name = (
"(Optional) Provide your email for a chance to win a $20.00 Tim Horton's gift card"
)
# note: drop_duplicates treats NaN values as duplicates of one another,
# so at most one blank-email row survives this step
df = df.drop_duplicates(
    subset=[col_name],
    keep="last",
)
| | Timestamp | What do you think is an acceptable amount of time to wait for services at your pharmacy? | How long do you usually wait at the pharmacy? | Have you used Amazon Lockers or a similar pick-up Lockers for before? | How likely are you to use an automated pick-up locker to pick up your prescriptions? | Email Address | What is your age range? | (Optional) Provide your email for a chance to win a $20.00 Tim Horton's gift card | How often do you usually visit pharmacies in a given year | Think about the last time you went to the pharmacy. How would you best describe your experience? |
|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 10/5/2022 18:48:45 | 11-15 minutes | 0-3 minutes | No | Somewhat likely | NaN | Under 21 | zoe.cushman@protonmail.com | NaN | NaN |
| 3 | 10/5/2022 19:10:21 | 4-6 minutes | more than 15 minutes | No | Very likely | NaN | 21 - 35 | nbudatho@uwaterloo.ca | NaN | NaN |
| 4 | 10/5/2022 19:20:25 | 4-6 minutes | 7-9 minutes | No | Not very likely | NaN | 21 - 35 | glmdenney17@gmail.com | NaN | NaN |
| 5 | 10/5/2022 19:25:40 | 4-6 minutes | 0-3 minutes | No | Likely | NaN | 21 - 35 | m.balghonaim@gmail.com | NaN | NaN |
| 6 | 10/5/2022 19:32:27 | 4-6 minutes | 7-9 minutes | No | Very likely | NaN | 21 - 35 | jjwilkin@uwaterloo.ca | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2831 | 10/17/2022 1:07:53 | 7-9 minutes | 11-15 minutes | Yes | Likely | NaN | 40 - 64 | anunley661@gmail.com | NaN | NaN |
| 2832 | 10/17/2022 1:08:06 | 7-9 minutes | 11-15 minutes | Yes | Likely | NaN | 40 - 64 | alantheisen106@gmail.com | NaN | NaN |
| 2833 | 10/17/2022 1:09:47 | 4-6 minutes | 7-9 minutes | Yes | Likely | NaN | 35 - 40 | rjudson123@gmail.com | NaN | NaN |
| 2834 | 10/17/2022 1:10:11 | 4-6 minutes | 7-9 minutes | Yes | Likely | NaN | 35 - 40 | hsylvester837@gmail.com | NaN | NaN |
| 2835 | 10/17/2022 1:11:43 | 4-6 minutes | 7-9 minutes | Yes | Likely | NaN | 35 - 40 | ddominy87@gmail.com | NaN | NaN |
2706 rows × 10 columns
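On a toy frame, drop_duplicates with keep="last" behaves like this (the column names here are illustrative). Note that pandas treats NaN values as duplicates of one another, so blank optional emails would also be collapsed to a single row if more than one were present:

```python
import pandas as pd

# toy frame: two responses from a@x.com, one blank (NaN) email, one from b@x.com
toy = pd.DataFrame({
    "email": ["a@x.com", None, "a@x.com", "b@x.com"],
    "answer": [1, 2, 3, 4],
})

# keep only the most recent (last) response per email
deduped = toy.drop_duplicates(subset=["email"], keep="last")
print(deduped["answer"].tolist())  # [2, 3, 4]: the earlier a@x.com row is dropped
```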
Collect the emails and store them separately from the data.
emails = pd.DataFrame(df[col_name])
# rename columns for sanity
emails.rename(
columns={col_name: "email"},
inplace=True,
)
# drop duplicates
emails = emails.drop_duplicates()
# drop blanks
emails = emails.dropna()
# scramble the emails so they are in random order
emails = emails.sample(frac=1).reset_index(drop=True)
# write file to .csv
email_output_file_name = "emails_for_reward.csv"
filepath = Path("./output/" + email_output_file_name)
filepath.parent.mkdir(parents=True, exist_ok=True)
emails.to_csv(filepath)
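sample(frac=1) returns all rows in random order, and reset_index(drop=True) then discards the original positions so the ordering cannot be recovered from the index. A minimal sketch (random_state is fixed only so the demo is repeatable; the addresses are fake):

```python
import pandas as pd

s = pd.DataFrame({"email": [f"user{i}@example.com" for i in range(5)]})
shuffled = s.sample(frac=1, random_state=0).reset_index(drop=True)

print(sorted(shuffled["email"]) == sorted(s["email"]))  # True: same rows, new order
print(list(shuffled.index))  # [0, 1, 2, 3, 4]: original positions are gone
```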
With the emails saved, we can now scrub the data and clean it up for analysis.
# shuffle the rows and re-index
df = df.sample(frac=1).reset_index(drop=True)
# drop columns containing traceable information
df = df.drop(
columns=[
col_name,
"Email Address",
"Timestamp",
"How often do you usually visit pharmacies in a given year",
"Think about the last time you went to the pharmacy. How would you best describe your experience?",
]
)
# add a UUID identifier to each row
df.insert(0, "id", [uuid.uuid4() for _ in range(len(df.index))])
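uuid4 identifiers are random 128-bit values, so collisions across a few thousand rows are practically impossible. A quick uniqueness check, as a standalone sketch rather than part of the pipeline:

```python
import uuid

# generate as many ids as there are survey rows and confirm they are distinct
ids = [uuid.uuid4() for _ in range(2706)]
print(len(set(ids)) == len(ids))  # True: every id is unique
```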
Save the data to ./output/ as patient-survey-data.csv.
# write file to .csv
data_file_name = "patient-survey-data.csv"
filepath = Path("./output/" + data_file_name)
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath)
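As a final sanity check, one could confirm that the identifying columns really are gone and that every row kept a distinct id. The sketch below runs on a tiny stand-in frame (the column names match those used above) rather than reading the output file, so it is self-contained:

```python
import uuid

import pandas as pd

# tiny stand-in for the scrubbed frame
scrubbed = pd.DataFrame({
    "id": [uuid.uuid4() for _ in range(3)],
    "What is your age range?": ["Under 21", "21 - 35", "40 - 64"],
})

identifying = {"Email Address", "Timestamp"}
print(identifying.isdisjoint(scrubbed.columns))  # True: no identifying columns left
print(scrubbed["id"].is_unique)  # True: each row keeps a distinct UUID
```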