Chapter 7 Data Quality

Data quality requires a certain level of sophistication within a company to even understand that it’s a problem
Colleen Graham

7.1 Data Profiling

7.1.1 Find duplicates in single column

Description
Method to find duplicate values in a single column of a dataframe column
Ingredients
Package Data

readr
janitor

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name)


Actual Instructions

janitor::get_dupes(df, country)

7.1.2 Find duplicates in multiple columns

Description
Method to find duplicate values across multiple columns of a dataframe column
Ingredients
Package Data

readr
janitor

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name1, column_name2)


Actual Instructions

janitor::get_dupes(df, country, capital)

7.2 Data Cleaning

7.2.1 Clean common header naming issues

Description
Replaces common data header naming conversion issues; such as spaces, special characters, and duplicate names
Ingredients
Package Data

readr
janitor

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data)


Actual Instructions

janitor::clean_names(claims)

7.2.2 Remove empty columns

Description
Removes columns that have no values and are completely empty
Ingredients
Package Data

readr
janitor

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data)


Actual Instructions

janitor::remove_empty(claims)

7.2.3 Remove empty rows

Description
Removes rows that have no values and are completely empty
Ingredients
Package Data

readr
janitor

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data)


Actual Instructions

janitor::remove_empty_rows(claims)

7.2.4 Remove empty rows and columns

Description
Removes rows and columns that have no values and are completely empty
Ingredients
Package Data

readr
janitor

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, which = c("rows", "cols")


Actual Instructions

janitor::remove_empty(claims, which = c("rows", "cols")

7.2.5 Convert null to specific character value in a single column

Description
Method to change all null to a specific character value in a single character column of a dataframe
Ingredients
Package Data

readr
tidyr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, list(column_name = "text"))


Actual Instructions

tidyr::replace_na(df, list(country = "unknown"))

7.2.6 Convert null to specific numeric value in a single column

Description
Method to change all null to a specific numeric value in a single numeric column of a dataframe
Ingredients
Package Data

readr
tidyr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, list(column_name = number))


Actual Instructions

tidyr::replace_na(df, list(population = -999))

7.2.7 Remove leading whitespace from a character values in a single column

Description
Method to remove whitespace at the beginning of a character value in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name, side = c("right")))


Actual Instructions

dplyr::mutate(df, `Municipal Unit` = stringr::str_trim(`Municipal Unit`, side = c("right")))

7.2.8 Remove trailing whitespace from a character values in a single column

Description
Method to remove whitespace at the end of a character value in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name, side = c("left")))


Actual Instructions

dplyr::mutate(df, `Municipal Unit` = stringr::str_trim(`Municipal Unit`, side = c("left")))

7.2.9 Remove leading and trailing whitespace from a character values in a single column

Description
Method to remove whitespace at the beginning and end of a character value in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name, side = c("both")))


Actual Instructions

dplyr::mutate(df, `Municipal Unit` = stringr::str_trim(`Municipal Unit`, side = c("both")))

7.2.10 Change all characters to lower case in a single column

Description
Method to change all character values to lower case in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name))


Actual Instructions

dplyr::mutate(x, `Civic Street Name` = stringr::str_to_lower(`Civic Street Name`))

7.2.11 Change all characters to upper case in a single column

Description
Method to change all character values to upper case in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name))


Actual Instructions

dplyr::mutate(x, `Civic Street Name` = stringr::str_to_upper(`Civic Street Name`))

7.2.12 Change all characters to title case in a single column

Description
Method to change all character values to title case in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name))


Actual Instructions

dplyr::mutate(x, `Civic Street Name` = stringr::str_to_title(`Civic Street Name`))

7.2.13 Change all characters to sentence case in a single column

Description
Method to change all character values to sentence case in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name))


Actual Instructions

dplyr::mutate(x, `Civic Street Name` = stringr::str_to_sentence(`Civic Street Name`))

7.2.14 Remove punctuation from a single character column

Description
Method to remove punctuation including special characters from a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name, "[[:punct:]]", ""))


Actual Instructions

dplyr::mutate(df, `Municipal Unit` = stringr::str_replace_all(`Municipal Unit`, "[[:punct:]]", ""))

7.2.15 Change all spaces to underscores in a single character column

Description
Method to change all spaces to an underscore in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, column_name = package::function(column_name, " ", "_"))


Actual Instructions

dplyr::mutate(df, `Municipal Unit` = str_replace_all(`Municipal Unit`, " ", "_"))

7.2.16 Convert null with specific value in all character columns

Description
Method to change all null to a specific character value in a all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
tidyr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, is.character, package::function, replace = "text")


Actual Instructions

dplyr::mutate_if(df, is.character, tidyr::replace_na, replace = "unknown")

7.2.17 Convert null with specific value in all numeric columns

Description
Method to change all null to a specific numeric value in a all numeric columns of a dataframe
Ingredients
Package Data

readr
dplyr
tidyr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, is.numeric, package::function, replace = number)


Actual Instructions

dplyr::mutate_if(x, is.numeric, tidyr::replace_na, replace = -999)

7.2.18 Remove whitespace in all character columns

Description
Method to remove whitespace at the beginning and end of a character value in all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(where(is.character), package::function))


Actual Instructions

dplyr::mutate(df, dplyr::across(where(is.character), stringr::str_trim))

7.2.19 Change all characters to lower case in all character columns

Description
Method to change all character values to lower case in all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(where(is.character), package::function))


Actual Instructions

dplyr::mutate(x, dplyr::across(where(is.character), stringr::str_to_lower))

7.2.20 Change all characters to upper case in all character columns

Description
Method to change all character values to upper case in all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(where(is.character), package::function))


Actual Instructions

dplyr::mutate(x, dplyr::across(where(is.character), stringr::str_to_upper))

7.2.21 Change all characters to title case in all character columns

Description
Method to change all character values to title case in all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(where(is.character), package::function))


Actual Instructions

dplyr::mutate(x, dplyr::across(where(is.character), stringr::str_to_title))

7.2.22 Change all characters to sentence case in all character columns

Description
Method to change all character values to sentence case in all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(where(is.character), package::function))


Actual Instructions

dplyr::mutate(x, dplyr::across(where(is.character), stringr::str_to_sentence))

7.2.23 Remove punctuation from all character columns

Description
Method to remove punctuation including special characters from all character columns of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(package::function(., "[[:punct:]]", "")))


Actual Instructions

dplyr::mutate_all(df, dplyr::funs(stringr::str_replace_all(., "[[:punct:]]", "")))

7.2.24 Change all spaces to underscores in all character columns

Description
Method to change all spaces to an underscore in a single character column of a dataframe
Ingredients
Package Data

readr
dplyr
stringr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

package::function(data, package::function(package::function(., " ", "_")))


Actual Instructions

dplyr::mutate_all(df, dplyr::funs(stringr::str_replace_all(., " ", "_")))

7.3 Combining Commands

7.3.1 Clean structure and values of a dataframe

Description
Method to cleann the structure and values in all character and numeric columns of a dataframe
Ingredients
Package Data

readr
dplyr
janitor
stringr
tidyr

sample.csv


Preparation

df <- readr::read_csv("C:/data/sample.csv")


Sample Instructions

data <- data %>%
  package::function() %>%
  package::function() %>%
  package::function(is.character, package::function, replace = "text") %>%
  package::function(is.numeric, package::function, replace = number) %>%
  package::function(package::function(where(is.character), package::function)) %>%
  package::function(package::function(where(is.character), package::function)) %>%
  package::function(package::function(package::function(., "[[:punct:]]", "")))


Actual Instructions

data <- data %>%
  janitor::clean_names() %>%
  janitor::remove_empty() %>%
  dplyr::mutate_if(is.character, tidyr::replace_na, replace = "unknown") %>%
  dplyr::mutate_if(is.numeric, tidyr::replace_na, replace = -999) %>%
  dplyr::mutate(dplyr::across(where(is.character), stringr::str_trim)) %>%
  dplyr::mutate(dplyr::across(where(is.character), stringr::str_to_upper)) %>%
  dplyr::mutate_all(dplyr::funs(stringr::str_replace_all(., "[[:punct:]]", "")))