| Read in CSV file |
Import |
mydt <- fread(โmyfile.csvโ) |
myt <- read_csv(โmyfile.csvโ) #OR
myt <- vroom::vroom(โmyfile.csvโ) |
| Import the first x number of rows of a CSV file |
Import |
mydt_x <- fread(โmyfile.csvโ, nrows = x) |
myt_x <- read_csv(โmyfile.csvโ, n_max = x) |
| Import only those rows from a CSV file that match a certain pattern |
Import |
mydt_pattern <- fread(โgrep โmypatternโ myfile.csvโ) |
myt_pattern <- vroom::vroom(pipe(โgrep โmypatternโ myfile.csvโ)) |
| Import a .gz compressed file |
Import |
mydt <- fread(โmyfile.gzโ) |
myt <- vroom::vroom(โmyfile.gzโ) |
| Import a.zip compressed file |
import |
mydt <- fread(cmd = โunzip -cq myfile.zipโ) |
myt <- read_csv(โmyfile.zipโ) |
| Create data table from existing data frame (tibble for tidyverse) |
Import |
mydt <- as.data.table(mydf) #OR
setDT(mydf) |
myt <- as_tibble(mydf) |
| Alter data.table in place without making a copy |
Wrangle |
any function that starts with set such as setkey(mydt, mycol) or using the := operator within brackets |
not applicable |
| Order rows based on multiple column values |
Wrangle |
mydt2 <- mydt[order(colA, -colB)] #OR
setorder(mydt, colA, -colB) |
myt <- arrange(myt, colA, -colB) |
| Rename columns |
Wrangle |
setnames(mydt, old = c(โcolAโ,โcolBโ), new = c(โNewColAโ, โNewColBโ)) |
myt <- rename(myt, NewColA = colA, NewColB = colB) |
| Reordering columns: Move some columns to the front (left-most) position |
Wrangle |
setcolorder(mydt, c(โcolBโ, โcolCโ)) # colB now in position 1 and colC in position 2 |
myt <- relocate(myt, colB, colC) |
| Filter rows for row number n |
Subset |
mydt2 <- mydt[n] |
myt2 <- slice(myt, n) |
| Filter for the last row |
Subset |
mydt2 <- mydt[.N] |
myt2 <- slice(myt, n()) |
| Filter rows by condition |
Subset |
# In some cases setkey(mydt, colA, colB) will speed performance
# for logical tests on colA and colB; same with other columns
mydt2 <- mydt[logical expression] |
myt2 <- filter(myt, logical expression) |
| Filter rows where colA equals string1 or string2 |
Subset |
mydt2 <- mydt[colA %chin% c(โstring1โ, โstring2โ)] |
myt2 <- filter(myt, colA %in% c(โstring1โ, โstring2โ)) |
| Filter rows where colA matches a regular expression |
Subset |
mydt2 <- mydt[colA %like% โmypatternโ] |
myt2 <- filter(myt, stringr::str_detect(colA, โmypatternโ)) |
| Filter rows where colA values are between 2 numbers |
Subset |
mydt2 <- mydt[colA %between% c(n1, n2)] |
myt2 <- filter(myt, between(colA, n1, n2)) |
| Filter for first n rows by group |
Subset |
mydt2 <- mydt[, .SD[1:n], by = groupcol] |
myt2 <- myt %>%
group_by(groupcol) %>%
slice(1:n) |
| Filter rows for maximum value by group |
Subset |
mydt2 <- mydt[, .SD[which.max(valcol)], by = groupcol] |
myt2 <- myt %>%
group_by(groupcol) %>%
filter(valcol == max(valcol)) |
| Select column and return results as a vector |
Subset |
myvec <- mydt[, colname] |
myvec <- pull(myt, colname) |
| Select multiple columns to create a new data.table (data frame or tibble for tidyverse) |
Subset |
mydt2 <- mydt[, list(colA, colB)] #OR
mydt2 <- mydt[, .(colA, colB)] #OR
mydt2 <- mydt[, c(โcolAโ, โcolBโ)] |
myt2 <- select(myt, colA, colB) |
| Select multiple columns using a variable containing the column names |
Subset |
my_col_names <- c(โcolAโ, โcolBโ)
mydt2 <- mydt[, ..my_col_names] #OR
mydt2 <- mydt[, my_col_names, with = FALSE] |
my_col_names <- c(โcolAโ, โcolBโ)
myt2 <- select(myt, all_of(my_col_names)) |
| Select multiple columns and rename some |
Subset |
mydt2 <- mydt[, .(newname1 = col1, newname2 = col2, col3)] |
myt2 <- select(myt, newname1 = col1, newname2 = col2, col3) |
| Exclude multiple columns |
Subset |
mydt2 <- mydt[, -c(โcolAโ, โcolBโ)] #OR
mydt2 <- mydt[, !c(โcolAโ, โcolBโ)] #OR
my_col_names <- c(โcolAโ, โcolBโ)
mydt2 <- mydt[, !..my_col_names] |
myt2 <- select(myt, -c(colA, colB)) #OR
my_col_names <- c(โcolAโ, โcolBโ)
myt2 <- select(myt, -{{my_col_names}}) |
| Remove duplicate rows based on values in multiple columns |
Subset |
mydt2 <- unique(mydt, by = c(โcolAโ, โcolBโ)) |
myt2 <- distinct(myt, colA, colB, .keep_all = TRUE) |
| Count unique rows based on multiple columns |
Summarize |
uniqueN(mydt, by = c(โcolAโ, โcolBโ)) |
nrow(distinct(myt, colA, colB)) |
| Run summary calculations on data |
Summarize |
mydt2 <- mydt[, myfun(colA โฆ)] |
myt2 <- myt %>%
summarise(
ColName = myfun(colA โฆ)
) |
| Run summary calculations on data by one group |
Summarize |
mydt2 <- mydt[, myfun(colA โฆ), by = groupcol] |
myt2 <- myt %>%
group_by(groupcol) %>%
summarise(
NewCol = myfun(colAโฆ)
) |
| Run summary calculations on data by one group and name new column |
Summarize |
mydt2 <- mydt[, .(MyNewCol = myfun(colAโฆ)), by = groupcol] |
myt2 <- myt %>%
group_by(groupcol) %>%
summarise(
NewCol = myfun(colAโฆ)
) |
| Run summary calculations on data by multiple groups |
Summarize |
mydt2 <- mydt[, myfun(colA โฆ), by = .(groupcol1, groupcol2)] |
myt2 <- myt %>%
group_by(groupcol1, groupcol2) %>%
summarise(
NewCol = myfun(colAโฆ)
) |
| Run summary calculation on filtered data by multiple groups |
Summarize |
mydt2 <- mydt[filter expression, myfun(colA), by = .(groupcol1, groupcol2)] |
myt2 <- myt %>%
filter(filter expression) %>%
group_by(groupcol1, groupcol2) %>%
summarise(
NewCol = myfun(colA), .groups = โkeepโ
) |
| Count number of rows by groups |
Summarize |
mydt2 <- mydt[,.N, by = groupcol] #for one group #OR
mydt2 <- mydt[, .N, by = .(groupcol1, groupcol2)] |
myt2 <- count(myt, groupcol) #for one group #OR
myt2 <- count(myt, groupcol1, groupcol2) |
| Summarize multiple columns and return results in multiple columns |
Summarize |
mydt2 <- mydt[, lapply(.SD, myfun),
.SDcols = c(โcolAโ, โcolBโ)] |
myt2 <- myt %>%
summarise(
across(c(colA, colB), myfun)
) |
| Summarize multiple columns by group and return results in multiple columns |
Summarize |
mydt2 <- mydt[, lapply(.SD, myfun),
.SDcols = c(โcolAโ, โcolBโ), by = groupcol] |
myt2 <- myt %>%
group_by(groupcol) %>%
summarise( across(c(colA, colB), myfun) ) |
| Add a column |
Calculate |
mydt[, MyNewCol := myfun(colA)] |
myt <- myt %>%
mutate(
MyNewCol = myfun(colA)
) |
| Add multiple columns at once |
Calculate |
# use any function or expression
mydt[, `:=`(NewCol1 = myfun(colA),
NewCol2 = colB + colC )] #OR
mydt[, c(โNewCol1โ, โnewCol2โ) := list(myfun(colA), colB + colC)] |
myt <- myt %>%
mutate(
MyNewCol1 = myfun(colA),
MyNewCol2 = colB + colC
) |
| Add column using current and previous values from another column, such as finding the difference between value on a date vs. the prior date |
Calculate |
mydt[, Diff := colA โ shift(colA)] |
myt <- mutate(myt, Diff = colA โ lag(colA)) |
| Add column referencing previous value of a column by a group |
Calculate |
mydt2 <- mydt[, Diff := colA โ shift(colA), by = groupcol] |
myt2 <- myt %>%
group_by(groupcol) %>%
mutate(
Diff = colA โ lag(colA)
) |
| Add column with row ID numbers by group |
Calculate |
mydt[, myid := 1:.N, by = groupcol] |
myt <- myt %>%
group_by(groupcol) %>%
mutate(
myid = row_number()
) |
| Add column based on several conditions without using multiple if else statements (like SQLโs CASE) |
Calculate |
# Needs data.table version 1.13 or later
# I like each condition on a new line but thatโs not required
mydt2 <- mydt[, NewCol := fcase(
condition1, โValue1โ,
condition2, โValue2โ,
condition3, โValue3โ,
default = โOtherโ # value for all else
)] |
myt2 <- myt %>%
mutate(
NewCol = case_when(
condition1 ~ โValue1โ,
condition2 ~ โValue2โ,
condition3 ~ โValue3โ,
TRUE ~ โOtherโ
)
) |
| Add column via operating by row |
Calculate |
mydt[, newcol := myfun(colB, colC, colD), by = 1:nrow(mydt)]
# or if colA has all unique values
mydt[, newcol := myfun(colB, colC, colD), by = colA] |
myt <- myt %>%
rowwise() %>%
mutate(
newcol = myfun(colB, colC, colD)
)
# or
myt <- myt %>%
rowwise() %>%
mutate(
#use dplyr select syntax:
newcol = myfun(c_across(colB:colD))
) |
| Join two data sets by more than one column; keep all in set1 but only matches in set2 |
Join |
mydt <- dt2[dt1, on = c(โdt2colโ = โdt1colโ)] #OR
mydt <- merge(dt1, dt2, by.x = โdt1colโ, by.y = โdt2colโ, all.x = TRUE) #OR
setkey(dt1, โdt1colโ)
setkey(dt2, โdt2colโ)
mydt <- dt2[dt1] |
myt <- left_join(df1, df2, by = c(โdf1colโ = โdf2colโ)) |
| Join 2 data sets by more than one column โ keep all in set1 but only matches in set2 |
Join |
mydt <- merge(dt1, dt2, by.x = c(โdt1colAโ, โdt1colBโ), by.y = c(โdt2colAโ, โdt2colBโ), all.x = TRUE, all.y = FALSE) #OR
setkey(dt1, dt1colA, dt1colB)
setkey(dt2, dt2colA, dt2colB)
mydt <- dt2[dt1] |
myt <- left_join(df1, df2, by = c(โdf1colAโ = โdf2colAโ, โdf1colBโ = โdf2colBโ)) |
| Join two data sets by one common column; only keep matches |
Join |
mydt <- merge(dt1, dt2, by.x = โdtcol1โ, by.y = โdtcol2โ) |
myt <- inner_join(df1, df2, by = c(โdf1colโ = โdf2colโ)) |
| Join two data sets by one common column and keep all data in both sets, whether or not there are matches |
Join |
mydt <- merge(dt1, dt2, by.x = โdtcol1โ, by.y = โdtcol2โ, all = TRUE) |
myt <- full_join(df1, df2, by = c(โdf1colโ = โdf2colโ)) |
| Combine two data sets by adding rows from one to the bottom of another |
Join |
mydt_joined <- rbindlist(list(mydt, mydt2)) |
myt_joined <- bind_rows(myt, myt2) |
| Reshape data wide to long |
Reshape |
mydt_long <- melt(mydt, measure.vars = c(โcol1โ, โcol2โ, โcol3โ), variable.name = โNewCategoryColNameโ, value.name = โNewValueColNameโ) |
myt_long <- pivot_longer(myt, cols = starts_with(โcolโ), names_to = โNewCategoryColNameโ, values_to = โNewValueColNameโ) |
| Reshape data long to wide |
Reshape |
mydt_wide <- dcast(mydt, id_col1 ~ col1 , value.var = โValueColNameโ) |
myt_wide <- pivot_wider(myt, names_from = col1, values_from =ValueColName) |
| Chain multiple expressions |
Wrangle |
mydt[expr1][expr2] |
myt <- myt %>%
expr1 %>%
expr2 |
| Export data to a CSV file |
Export |
fwrite(mydt, โmyfile.csvโ) |
write_csv(myt, โmyfile.csvโ) |
| Append rows to an existing CSV file |
Export |
fwrite(mydt2, โmyfile.csvโ, append = TRUE) |
vroom::vroom_write(myt2, โmyfile.csvโ, delim = โ,โ, append = TRUE) |
| Export data to a compressed CSV file |
Export |
fwrite(mydt, โmyfile.csv.gzโ, compress = โgzipโ) |
vroom::vroom_write(myt, โmyfile2.csv.gzโ) |