| Line 1: |
Line 1: |
| | == tidyr Package == | | == tidyr Package == |
| | http://vita.had.co.nz/papers/tidy-data.pdf | | http://vita.had.co.nz/papers/tidy-data.pdf |
| | + | |
| | <source lang="rsplus"> | | <source lang="rsplus"> |
| | # gather() | | # gather() |
| Line 10: |
Line 11: |
| | If we have a dataset with multiple variables stored in one column, for example:<br /> | | If we have a dataset with multiple variables stored in one column, for example:<br /> |
| | grade, male_1, female_1, male_2, female_2, are colums storing grades for two diferent classes. | | grade, male_1, female_1, male_2, female_2, are colums storing grades for two diferent classes. |
| | + | |
| | <source lang="rsplus"> | | <source lang="rsplus"> |
| | res <- gather(students2, sex_class, count, -grade) | | res <- gather(students2, sex_class, count, -grade) |
| Line 28: |
Line 30: |
| | 2 Sally final C <NA> C <NA> <NA></nowiki> | | 2 Sally final C <NA> C <NA> <NA></nowiki> |
| | | | |
| | + | This code will tidy the data: |
| | <source lang="rsplus"> | | <source lang="rsplus"> |
| | students3 %>% | | students3 %>% |
| | gather(class, grade, class1:class5, na.rm = TRUE) %>% | | gather(class, grade, class1:class5, na.rm = TRUE) %>% |
| | print | | print |
| | + | </source> |
| | + | spread()<br /> |
| | + | Spreads a colum into its variables. |
| | + | <source lang="rsplus"> |
| | + | students3 %>% |
| | + | gather(class, grade, class1:class5, na.rm = TRUE) %>% |
| | + | spread(test, grade) %>% |
| | + | print |
| | + | </source> |
| | + | To change the values from clas1, clas2... to 1, 2... |
| | + | <source lang="rsplus"> |
| | + | students3 %>% |
| | + | gather(class, grade, class1:class5, na.rm = TRUE) %>% |
| | + | spread(test, grade) %>% |
| | + | mutate(class= extract_numeric(class)) %>% |
| | + | print |
| | + | </source> |
| | + | Tidy multiple observational units in one table (id, name, sex repeated...): |
| | + | <nowiki> |
| | + | > students4 |
| | + | id name sex class midterm final |
| | + | 1 168 Brian F 1 B B |
| | + | 2 168 Brian F 5 A C |
| | + | 3 588 Sally M 1 A C |
| | + | 4 588 Sally M 3 B C |
| | + | 5 710 Jeff M 2 D E |
| | + | 6 710 Jeff M 4 A C |
| | + | 7 731 Roger F 2 C A |
| | + | 8 731 Roger F 5 B A |
| | + | 9 908 Karen M 3 C C |
| | + | 10 908 Karen M 4 A A</nowiki> |
| | + | To solve this will breake the dataset in two:<br /> |
| | + | (id, name, and sex)<br /> |
| | + | (id, class, midterm, final)<br /> |
| | + | <source lang="rsplus"> |
| | + | student_info <- students4 %>% |
| | + | select(id, name, sex) %>% |
| | + | unique() %>% |
| | + | print |
| | + | gradebook <- students4 %>% |
| | + | select(id, class, midterm, final) %>% |
| | + | print |
| | + | </source> |
| | + | If a single observation is stored in multiple tables: |
| | + | <nowiki> |
| | + | > passed |
| | + | name class final |
| | + | 1 Brian 1 B |
| | + | 2 Roger 2 A |
| | + | 3 Roger 5 A |
| | + | 4 Karen 4 A |
| | + | |
| | + | > failed |
| | + | name class final |
| | + | 1 Brian 5 C |
| | + | 2 Sally 1 C |
| | + | 3 Sally 3 C |
| | + | 4 Jeff 2 E |
| | + | 5 Jeff 4 C |
| | + | 6 Karen 3 C</nowiki> |
| | + | |
| | + | <source lang="rsplus"> |
| | + | # first we need to add to each table the variable passed or failed accordingly. |
| | + | passed <- mutate(passed, status="passed") |
| | + | failed <- mutate(failed, status="failed") |
| | + | </source> |
| | + | |
| | + | Example: dataset -> http://research.collegeboard.org/programs/sat/data/cb-seniors-2013 |
| | + | <source lang="rsplus"> |
| | + | # Load dataset in variable sat |
| | + | sat %>% |
| | + | select(-contains("total")) %>% |
| | + | gather(part_sex, count, -score_range) %>% |
| | + | separate(part_sex, c("part", "sex")) %>% |
| | + | ### <Your call to group_by()> %>% |
| | + | mutate(###, |
| | + | ### |
| | + | ) %>% print |
| | + | |
| | + | # The same adding total and prop |
| | + | sat %>% |
| | + | select(-contains("total")) %>% |
| | + | gather(part_sex, count, -score_range) %>% |
| | + | separate(part_sex, c("part", "sex")) %>% |
| | + | group_by(part, sex) %>% |
| | + | mutate(total = sum(count), |
| | + | prop = count/total |
| | + | ) %>% print |
| | </source> | | </source> |