Read in the data

require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggplot2)
## Loading required package: ggplot2
df <- tbl_df(fread("pp_ccwid_bn_2012_r.csv", header=T))
## 
Read 0.0% of 21970751 rows
Read 6.2% of 21970751 rows
Read 12.3% of 21970751 rows
Read 18.6% of 21970751 rows
Read 24.7% of 21970751 rows
Read 30.9% of 21970751 rows
Read 37.0% of 21970751 rows
Read 43.2% of 21970751 rows
Read 49.4% of 21970751 rows
Read 55.7% of 21970751 rows
Read 61.9% of 21970751 rows
Read 68.1% of 21970751 rows
Read 74.3% of 21970751 rows
Read 80.5% of 21970751 rows
Read 86.6% of 21970751 rows
Read 92.5% of 21970751 rows
Read 98.3% of 21970751 rows
Read 21970751 rows and 9 (of 9) columns from 1.370 GB file in 00:00:20

Find top 50 drugs by claim count

d <- df %>% group_by(BN) %>% summarize(tot_claim = sum(CLAIM_COUNT)) %>% arrange(desc(tot_claim))
qplot(x=reorder(BN, -tot_claim), y=tot_claim, geom="bar", stat="identity", data=d[1:50,]) + 
  labs(title="Top 50 drugs by claim count", x="Drug", y="Claims") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1))

Find the top 50 drugs by gross expense

d <- df %>% group_by(BN) %>%  summarize(tot_cost = sum(GROSS_DRUG_COST_SUM1)/1e6) %>% 
  arrange(desc(tot_cost))
qplot(x=reorder(BN, -tot_cost), y=tot_cost, geom="bar", stat="identity", data=d[1:50,]) +
  labs(title="Top 50 drugs by cost", x="Drug", y="Cost in millions ($)") +
  theme_bw() + 
   theme(axis.text.x = element_text(angle=45, hjust=1))

Find top 50 drug classes by claims count

df <- tbl_df(fread("pp_part_d_with_category.csv", header=T))
## 
Read 2.6% of 10186671 rows
Read 20.1% of 10186671 rows
Read 34.8% of 10186671 rows
Read 50.5% of 10186671 rows
Read 66.6% of 10186671 rows
Read 82.4% of 10186671 rows
Read 98.0% of 10186671 rows
Read 10186671 rows and 10 (of 10) columns from 0.692 GB file in 00:00:09
d <- df %>% group_by(CATEGORY) %>% summarize(tot_claim = sum(CLAIM_COUNT)) %>%
  arrange(desc(tot_claim))

qplot(x=reorder(CATEGORY, -tot_claim), y=tot_claim, geom="bar", stat="identity", data=d[1:50,]) + 
  labs(title="Top 50 drug categories by claim count", x="Drug category", y="Claims") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1))

pdf("test.pdf", width=6, height=4)
qplot(x=CATEGORY, y=CLAIM_COUNT, geom="boxplot", data=df %>% filter(CATEGORY %in% d$CATEGORY[1:50]), outlier.size=.8) + 
  labs(title="Top 50 drug categories by claim count", x="Drug category", y="Claims") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1))
dev.off()
## pdf 
##   2

Find top 50 drug classes by gross cost

d <- df %>% group_by(CATEGORY) %>% summarize(tot_cost = sum(GROSS_DRUG_COST_SUM1)/1e6) %>%
  arrange(desc(tot_cost))

qplot(x=reorder(CATEGORY, -tot_cost), y=tot_cost, geom="bar", stat="identity", data=d[1:50,]) +
  labs(title="Top 50 drug categories by total cost", x="Drug category", y="Cost in millions ($)") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1))

d <- df %>% filter(CATEGORY %in% d$CATEGORY[1:50])

qplot(x=CATEGORY, y=GROSS_DRUG_COST_SUM1, geom="boxplot", data=d, outlier.size=0.8) +
  labs(title="Top 50 drug categories by total cost", x="Drug category", y="Cost in dollars ($)") + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1))

d <- df %>% group_by(CATEGORY) %>% summarize(tot_cost = sum(GROSS_DRUG_COST_SUM1)) %>%
  arrange(desc(tot_cost))
qplot(x=tot_cost, y=reorder(CATEGORY, tot_cost), geom="point", data=d) +
  geom_point(shape=5) +
  labs(title="Total cost for each drug category", x="Cost ($)", y="Drug category") +
  theme_bw()