Difference between revisions of "R: dplyr package"

From RHS Wiki
Jump to navigation Jump to search
m (Protected "R: dplyr package" ([Edit=Allow only administrators] (indefinite) [Move=Allow only administrators] (indefinite)))
 
(2 intermediate revisions by the same user not shown)
Line 37: Line 37:
 
summarize(cran, avg_bytes = mean(size))
 
summarize(cran, avg_bytes = mean(size))
  
 +
pack_sum <- summarize(by_package,
 +
                      count = n(),
 +
                      unique = n_distinct(ip_id),
 +
                      countries = n_distinct(country),
 +
                      avg_bytes = mean(size))
  
 +
quantile(pack_sum$count, probs = 0.99)
 +
top_counts<-filter(pack_sum, count > 679)
 +
View(top_counts)
 +
top_counts_sorted<-arrange(top_counts, desc(count))
 +
View(top_counts_sorted)
 +
 +
# group_by()
 +
by_package <- group_by(cran, package)
 +
summarize(by_package, mean(size))
 +
</source>
 +
 +
Rank packages.
 +
<source lang="rsplus">
 +
result2 <-
 +
  arrange(
 +
    filter(
 +
      summarize(group_by(cran, package),
 +
        count = n(),
 +
        unique = n_distinct(ip_id),
 +
        countries = n_distinct(country),
 +
        avg_bytes = mean(size)
 +
      ),
 +
      countries > 60
 +
    ), desc(countries), avg_bytes
 +
  )
 +
 +
print(result2)
 +
</source>
 +
 +
With pipelines
 +
<source lang="rsplus">
 +
result3 <-
 +
  cran %>%
 +
  group_by(package) %>%
 +
  summarize(count = n(),
 +
            unique = n_distinct(ip_id),
 +
            countries = n_distinct(country),
 +
            avg_bytes = mean(size)
 +
  ) %>%
 +
  filter(countries > 60) %>%
 +
  arrange(desc(countries), avg_bytes)
 +
 +
# Print result to console
 +
print(result3)
 
</source>
 
</source>

Latest revision as of 13:09, 23 April 2015

# Load dplyr and report which version is installed.
library(dplyr)
packageVersion("dplyr")

# Read the raw CSV into a plain data frame, keeping strings as character.
# NOTE(review): `path2csv` is not defined anywhere on this page -- it must
# already hold the path to the CSV file before this line runs.
mydf <- read.csv(path2csv, stringsAsFactors = FALSE)

# Convert to a tibble. tbl_df() is deprecated since dplyr 1.0.0;
# as_tibble() (re-exported by dplyr) is its replacement.
cran <- as_tibble(mydf)

# fundamental data manipulation tasks: select(), filter(), arrange(), mutate(), and summarize()

# select(): pick columns by name or by a name range
cran %>% select(ip_id, package, country)
cran %>% select(r_arch:country)
cran %>% select(country:r_arch)

# A leading minus drops the named column (or column range) instead
cran %>% select(-time)
cran %>% select(-(X:size))

# filter(): keep only the rows that satisfy the given conditions.
# Comma-separated conditions must all hold; `|` expresses "either".
cran %>% filter(package == "swirl")
cran %>% filter(r_version == "3.1.1", country == "US")
cran %>% filter(country == "US" | country == "IN")
cran %>% filter(size > 100500, r_os == "linux-gnu")
cran %>% filter(!is.na(r_version))

# arrange(): order rows by one or more columns; desc() reverses a column
cran2 <- cran %>% select(size:ip_id)
cran2 %>% arrange(ip_id)
cran2 %>% arrange(desc(ip_id))
cran2 %>% arrange(package, ip_id)
cran2 %>% arrange(country, desc(r_version), ip_id)

# mutate(): add new columns computed from existing ones
# cran3 was referenced here without ever being created, so the calls below
# failed with "object 'cran3' not found". Build a narrow subset first.
# NOTE(review): column choice is assumed (only `size` is required by the
# examples) -- adjust if a different subset was intended.
cran3 <- select(cran, ip_id, package, size)

mutate(cran3, size_mb = size / 2^20)
# A column created earlier in the same call (size_mb) can be reused later.
mutate(cran3, size_mb = size / 2^20, size_gb = size_mb / 2^10)

# summarize()
# On an ungrouped table this collapses all rows into a single value.
summarize(cran, avg_bytes = mean(size))

# group_by()
# This must run before pack_sum is built: the original listing used
# by_package before assigning it, which fails when run top to bottom.
by_package <- group_by(cran, package)
summarize(by_package, mean(size))

# One row per package: row count, distinct ip_id values, distinct
# countries, and mean size.
pack_sum <- summarize(by_package,
                      count = n(),
                      unique = n_distinct(ip_id),
                      countries = n_distinct(country),
                      avg_bytes = mean(size))

# 99th percentile of the per-package row counts.
quantile(pack_sum$count, probs = 0.99)
# NOTE(review): 679 is hard-coded; presumably it was read off the quantile
# output above for this particular data set -- re-check if the data changes.
top_counts <- filter(pack_sum, count > 679)
View(top_counts)
top_counts_sorted <- arrange(top_counts, desc(count))
View(top_counts_sorted)

Rank packages.

# Ranking written as nested calls; read from the innermost call outwards:
# group_by -> summarise -> filter -> arrange.
result2 <- arrange(
  filter(
    summarise(
      group_by(cran, package),
      count     = n(),
      unique    = n_distinct(ip_id),
      countries = n_distinct(country),
      avg_bytes = mean(size)
    ),
    countries > 60
  ),
  desc(countries),
  avg_bytes
)

print(result2)

The same ranking written with the pipe operator (%>%)

# Each %>% passes the left-hand result as the first argument of the next
# verb, so the steps read top to bottom in the order they are applied.
result3 <- cran %>%
  group_by(package) %>%
  summarise(
    count     = n(),
    unique    = n_distinct(ip_id),
    countries = n_distinct(country),
    avg_bytes = mean(size)
  ) %>%
  filter(countries > 60) %>%
  arrange(desc(countries), avg_bytes)

# Show the ranked table on the console
print(result3)