Difference between revisions of "R: dplyr package"
Jump to navigation
Jump to search
Rafahsolis (talk | contribs) m (Protected "R: dplyr package" ([Edit=Allow only administrators] (indefinite) [Move=Allow only administrators] (indefinite))) |
Rafahsolis (talk | contribs) |
||
| Line 37: | Line 37: | ||
summarize(cran, avg_bytes = mean(size)) | summarize(cran, avg_bytes = mean(size)) | ||
| + | pack_sum <- summarize(by_package, | ||
| + | count = n(), | ||
| + | unique = n_distinct(ip_id), | ||
| + | countries = n_distinct(country), | ||
| + | avg_bytes = mean(size)) | ||
| + | quantile(pack_sum$count, probs = 0.99) | ||
| + | top_counts<-filter(pack_sum, count > 679) | ||
| + | View(top_counts) | ||
| + | top_counts_sorted<-arrange(top_counts, desc(count)) | ||
| + | View(top_counts_sorted) | ||
| + | |||
| + | # group_by() | ||
| + | by_package <- group_by(cran, package) | ||
| + | summarize(by_package, mean(size)) | ||
| + | </source> | ||
| + | |||
| + | Rank packages. | ||
| + | <source lang="rsplus"> | ||
| + | result2 <- | ||
| + | arrange( | ||
| + | filter( | ||
| + | summarize( | ||
| + | group_by(cran, | ||
| + | package | ||
| + | ), | ||
| + | count = n(), | ||
| + | unique = n_distinct(ip_id), | ||
| + | countries = n_distinct(country), | ||
| + | avg_bytes = mean(size) | ||
| + | ), | ||
| + | countries > 60 | ||
| + | ), | ||
| + | desc(countries), | ||
| + | avg_bytes | ||
| + | ) | ||
| + | |||
| + | print(result2) | ||
| + | </source> | ||
| + | |||
| + | <source lang="rsplus"> | ||
| + | result3 <- | ||
| + | cran %>% | ||
| + | group_by(package) %>% | ||
| + | summarize(count = n(), | ||
| + | unique = n_distinct(ip_id), | ||
| + | countries = n_distinct(country), | ||
| + | avg_bytes = mean(size) | ||
| + | ) %>% | ||
| + | filter(countries > 60) %>% | ||
| + | arrange(desc(countries), avg_bytes) | ||
| + | |||
| + | # Print result to console | ||
| + | print(result3) | ||
</source> | </source> | ||
Revision as of 00:34, 23 April 2015
# Load dplyr and show which version is installed.
library(dplyr)
packageVersion("dplyr")

# Read the CSV into a plain data frame, keeping strings as character vectors,
# then convert it with tbl_df() so the dplyr examples below operate on `cran`.
# NOTE(review): `path2csv` is not defined in this snippet; it must already
# exist in the session -- confirm where it comes from.
mydf <- read.csv(file = path2csv, stringsAsFactors = FALSE)
cran <- tbl_df(mydf)
# fundamental data manipulation tasks: select(), filter(), arrange(), mutate(), and summarize()
# select(): keep only the listed columns, in the order given.
select(cran, ip_id, package, country)

# A range `a:b` selects every column from `a` through `b`; reversing the
# endpoints returns the same columns in reverse order.
select(cran, r_arch:country)
select(cran, country:r_arch)

# Omit columns: a leading minus drops a column, or a whole range when the
# range is wrapped in parentheses.
select(cran, -time)
select(cran, -(X:size))
# filter(): keep only the rows that satisfy the condition.
filter(cran, package == "swirl")

# Several comma-separated conditions must all hold (logical AND).
filter(cran, r_version == "3.1.1", country == "US")
filter(cran, size > 100500, r_os == "linux-gnu")

# Use `|` when either condition is enough (logical OR).
filter(cran, country == "US" | country == "IN")

# Drop the rows where r_version is missing.
filter(cran, !is.na(r_version))
# arrange(): reorder rows. Work on a narrower table (columns size through
# ip_id) so the sorted output is easier to read.
cran2 <- select(cran, size:ip_id)

# Ascending by ip_id, then descending via desc().
arrange(cran2, ip_id)
arrange(cran2, desc(ip_id))

# Additional columns break ties left to right; desc() can be applied to any
# individual sort key.
arrange(cran2, package, ip_id)
arrange(cran2, country, desc(r_version), ip_id)
# mutate(): add new columns computed from existing ones.
# `cran3` was used here without ever being created; build it first as a small
# subset of `cran` that contains the `size` column the examples need.
cran3 <- select(cran, ip_id, package, size)

# Add size_mb = size / 2^20 (megabytes, assuming `size` is in bytes -- TODO
# confirm the unit against the data source).
mutate(cran3, size_mb = size / 2^20)

# A column created earlier in the same call (size_mb) can be used right away
# to define another one (size_gb).
mutate(cran3, size_mb = size / 2^20, size_gb = size_mb / 2^10)
# summarize(): collapse the whole table to a single row (mean of `size`).
summarize(cran, avg_bytes = mean(size))

# group_by(): group the rows by package so that summarize() returns one row
# per package. This must run before `by_package` is used below; the original
# ordering referenced `by_package` before it was created.
by_package <- group_by(cran, package)
summarize(by_package, mean(size))

# Per-package summary:
#   count     - number of rows for the package
#   unique    - number of distinct ip_id values
#   countries - number of distinct country values
#   avg_bytes - mean of size
pack_sum <- summarize(by_package,
                      count = n(),
                      unique = n_distinct(ip_id),
                      countries = n_distinct(country),
                      avg_bytes = mean(size))

# 99th percentile of the per-package row counts.
quantile(pack_sum$count, probs = 0.99)

# Keep the packages whose count exceeds 679.
# NOTE(review): 679 is hard-coded; presumably it is the quantile printed above
# for the original dataset -- re-check it if the data changes.
top_counts <- filter(pack_sum, count > 679)
View(top_counts)

# Same rows, largest count first.
top_counts_sorted <- arrange(top_counts, desc(count))
View(top_counts_sorted)
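Rank packages: summarize each package, keep those seen in more than 60 distinct countries, and sort by number of countries (descending), then by average size (ascending). The same computation is shown twice, first as nested calls and then as a %>% pipeline.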
# Nested-call form: read from the innermost call outward.
#   1. group_by():  group cran by package
#   2. summarize(): one row per package with four summary columns
#   3. filter():    keep packages with more than 60 distinct countries
#   4. arrange():   sort by countries (descending), then avg_bytes (ascending)
result2 <-
  arrange(
    filter(
      summarize(
        group_by(cran, package),
        count = n(),
        unique = n_distinct(ip_id),
        countries = n_distinct(country),
        avg_bytes = mean(size)
      ),
      countries > 60
    ),
    desc(countries),
    avg_bytes
  )

print(result2)
# Pipeline form: each %>% passes the result on its left to the next verb as
# its first argument, so the steps read top to bottom in execution order.
result3 <- cran %>%
  group_by(package) %>%
  summarize(
    count = n(),
    unique = n_distinct(ip_id),
    countries = n_distinct(country),
    avg_bytes = mean(size)
  ) %>%
  filter(countries > 60) %>%
  arrange(desc(countries), avg_bytes)

# Print result to console
print(result3)