R: tidyr

From RHS Wiki
Jump to navigation Jump to search

tidyr Package[edit]

http://vita.had.co.nz/papers/tidy-data.pdf

# gather()
# Having a dataset "students" with colums "grade", "male", "female"
gather(students, sex, count, -grade)
# Will return a dataset with the colums grade, sex, count

If we have a dataset with multiple variables stored in one column, for example:
grade, male_1, female_1, male_2, female_2, are colums storing grades for two diferent classes.

res <- gather(students2, sex_class, count, -grade)
separate(res, col=sex_class, into = c("sex", "class"))

# or using pipelines:
students2 %>%
  gather( sex_class, count, -grade) %>%
  separate( col=sex_class, into = c("sex", "class")) %>%
  print

If there are variables stored in both rows ad colums:

> students3
    name    test class1 class2 class3 class4 class5
1  Sally midterm      A   <NA>      B   <NA>   <NA>
2  Sally   final      C   <NA>      C   <NA>   <NA>

This code will tidy the data:

students3 %>%
  gather(class, grade, class1:class5, na.rm = TRUE) %>%
  print

spread()
Spreads a colum into its variables.

students3 %>%
  gather(class, grade, class1:class5, na.rm = TRUE) %>%
  spread(test, grade) %>%
  print

To change the values from clas1, clas2... to 1, 2...

students3 %>%
  gather(class, grade, class1:class5, na.rm = TRUE) %>%
  spread(test, grade) %>%
  mutate(class= extract_numeric(class)) %>%
  print

Tidy multiple observational units in one table (id, name, sex repeated...):

> students4
    id  name sex class midterm final
1  168 Brian   F     1       B     B
2  168 Brian   F     5       A     C
3  588 Sally   M     1       A     C
4  588 Sally   M     3       B     C
5  710  Jeff   M     2       D     E
6  710  Jeff   M     4       A     C
7  731 Roger   F     2       C     A
8  731 Roger   F     5       B     A
9  908 Karen   M     3       C     C
10 908 Karen   M     4       A     A

To solve this will breake the dataset in two:
(id, name, and sex)
(id, class, midterm, final)

student_info <- students4 %>%
  select(id, name, sex) %>%
  unique() %>%
  print
gradebook <- students4 %>%
  select(id, class, midterm, final) %>%
  print

If a single observation is stored in multiple tables:

> passed
   name class final
1 Brian     1     B
2 Roger     2     A
3 Roger     5     A
4 Karen     4     A

> failed
   name class final
1 Brian     5     C
2 Sally     1     C
3 Sally     3     C
4  Jeff     2     E
5  Jeff     4     C
6 Karen     3     C
# first we need to add to each table the variable passed or failed accordingly.
passed <- mutate(passed, status="passed")
failed <- mutate(failed, status="failed")

Example: dataset -> http://research.collegeboard.org/programs/sat/data/cb-seniors-2013

# Load dataset in variable sat
sat %>%
  select(-contains("total")) %>%
  gather(part_sex, count, -score_range) %>%
  separate(part_sex, c("part", "sex")) %>%
  ### <Your call to group_by()> %>%
  mutate(###,
         ###
  ) %>% print

# The same adding total and prop
sat %>%
  select(-contains("total")) %>%
  gather(part_sex, count, -score_range) %>%
  separate(part_sex, c("part", "sex")) %>%
  group_by(part, sex) %>%
  mutate(total = sum(count),
         prop = count/total
  ) %>% print