Read in the data
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggplot2)
## Loading required package: ggplot2
df <- tbl_df(fread("pp_ccwid_bn_2012_r.csv", header=T))
##
Read 0.0% of 21970751 rows
Read 6.2% of 21970751 rows
Read 12.3% of 21970751 rows
Read 18.6% of 21970751 rows
Read 24.7% of 21970751 rows
Read 30.9% of 21970751 rows
Read 37.0% of 21970751 rows
Read 43.2% of 21970751 rows
Read 49.4% of 21970751 rows
Read 55.7% of 21970751 rows
Read 61.9% of 21970751 rows
Read 68.1% of 21970751 rows
Read 74.3% of 21970751 rows
Read 80.5% of 21970751 rows
Read 86.6% of 21970751 rows
Read 92.5% of 21970751 rows
Read 98.3% of 21970751 rows
Read 21970751 rows and 9 (of 9) columns from 1.370 GB file in 00:00:20
Find top 50 drugs by claim count
d <- df %>% group_by(BN) %>% summarize(tot_claim = sum(CLAIM_COUNT)) %>% arrange(desc(tot_claim))
qplot(x=reorder(BN, -tot_claim), y=tot_claim, geom="bar", stat="identity", data=d[1:50,]) +
labs(title="Top 50 drugs by claim count", x="Drug", y="Claims") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
Find the top 50 drugs by gross expense
d <- df %>% group_by(BN) %>% summarize(tot_cost = sum(GROSS_DRUG_COST_SUM1)/1e6) %>%
arrange(desc(tot_cost))
qplot(x=reorder(BN, -tot_cost), y=tot_cost, geom="bar", stat="identity", data=d[1:50,]) +
labs(title="Top 50 drugs by cost", x="Drug", y="Cost in millions ($)") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
Find top 50 drug classes by claims count
df <- tbl_df(fread("pp_part_d_with_category.csv", header=T))
##
Read 2.6% of 10186671 rows
Read 20.1% of 10186671 rows
Read 34.8% of 10186671 rows
Read 50.5% of 10186671 rows
Read 66.6% of 10186671 rows
Read 82.4% of 10186671 rows
Read 98.0% of 10186671 rows
Read 10186671 rows and 10 (of 10) columns from 0.692 GB file in 00:00:09
d <- df %>% group_by(CATEGORY) %>% summarize(tot_claim = sum(CLAIM_COUNT)) %>%
arrange(desc(tot_claim))
qplot(x=reorder(CATEGORY, -tot_claim), y=tot_claim, geom="bar", stat="identity", data=d[1:50,]) +
labs(title="Top 50 drug categories by claim count", x="Drug category", y="Claims") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
pdf("test.pdf", width=6, height=4)
qplot(x=CATEGORY, y=CLAIM_COUNT, geom="boxplot", data=df %>% filter(CATEGORY %in% d$CATEGORY[1:50]), outlier.size=.8) +
labs(title="Top 50 drug categories by claim count", x="Drug category", y="Claims") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
dev.off()
## pdf
## 2
Find top 50 drug classes by gross cost
d <- df %>% group_by(CATEGORY) %>% summarize(tot_cost = sum(GROSS_DRUG_COST_SUM1)/1e6) %>%
arrange(desc(tot_cost))
qplot(x=reorder(CATEGORY, -tot_cost), y=tot_cost, geom="bar", stat="identity", data=d[1:50,]) +
labs(title="Top 50 drug categories by total cost", x="Drug category", y="Cost in millions ($)") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
d <- df %>% filter(CATEGORY %in% d$CATEGORY[1:50])
qplot(x=CATEGORY, y=GROSS_DRUG_COST_SUM1, geom="boxplot", data=d, outlier.size=0.8) +
labs(title="Top 50 drug categories by total cost", x="Drug category", y="Cost in dollars ($)") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
d <- df %>% group_by(CATEGORY) %>% summarize(tot_cost = sum(GROSS_DRUG_COST_SUM1)) %>%
arrange(desc(tot_cost))
qplot(x=tot_cost, y=reorder(CATEGORY, tot_cost), geom="point", data=d) +
geom_point(shape=5) +
labs(title="Total cost for each drug category", x="Cost ($)", y="Drug category") +
theme_bw()