Archive for the ‘R’ Category

LightGBM Grid Search Example in R

data(agaricus.train, package = "lightgbm")
train <- agaricus.traindtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data = FALSE)
data(agaricus.test, package = "lightgbm")
test <- agaricus.testdtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
valids <- list(test = dtest)

grid_search <- expand.grid(Depth = 8,
                           L1 = 0:5,
                           L2 = 0:5)

model <- list()
perf <- numeric(nrow(grid_search))

for (i in 1:nrow(grid_search)) {
  model[[i]] <- lgb.train(list(objective = "regression",
                          metric = "l2",
                          lambda_l1 = grid_search[i, "L1"],
                          lambda_l2 = grid_search[i, "L2"],
                          max_depth = grid_search[i, "Depth"]),
                     min_data = 1,
                     learning_rate = 1,
                     early_stopping_rounds = 5)
  perf[i] <- min(rbindlist(model[[i]]$record_evals$test$l2))

> cat("Model ", which.min(perf), " is lowest loss: ", min(perf), sep = "")
Model 1 is lowest loss: 1.972152e-31> print(grid_search[which.min(perf), ])
  Depth L1 L21     8  0  0

Happy Pi Day 2016!

Has it really been a whole year?

On Pi Day 2015 posted a tribute to Pi (π) Day, published on GitHub, wherein we created fractals in R based on π, scraped and displayed information on Pi and other fun stuff.

This year, find out how Fibonacci numbers, which are sequences of integers, have a freaky relationship with π! View the entire script on GitHub.

# Pi Fibonacci Sequence ---------------------------------------------------
cat("This year, we'll look at the relationship between Pi and Fibonacci sequences. \n")
cat("Until very recently there were just two methods used to compute pi (π),
one invented by the Greek mathematician Archimedes,
and the other by the Scottish mathematician James Gregory. \n")

cat("If we use Sir Gregory's arc tangent method, you'll notice a pattern...")


pi/4 == atan(1)

atan(1/5)  + atan(1/8)

atan(1/13) + atan(1/21)

cat("We can combine what we saw above")
atan(1/2) + atan(1/3)
atan(1/2) + atan(1/5) + atan(1/8)

atan(1/34) + atan(1/55)

cat("You'll notice that the pattern is a Fibonacci sequence! \n")

cat(" We have just seen that there are infinitely many formulae for π using the Fibonacci numbers!")


R: beginning and end of month

date.end.month <- seq(as.Date("2012-02-01"),length=4,by="months")-1
## —————————       ##
## Function takes a vector of  ##
## dates as its input.                 ##
##                                                  ##
## It produces a vector of        ##
## dates that are the first         ##
## in their respective months ##
## —————————       ##
  day = format(x,format=”%d”)
  monthYr = format(x,format=”%Y-%m”)
  y = tapply(day,monthYr, min)

R: cbind fill for data.table

cbind.fill <- function(…){
nm <- list(…)
nm <- lapply(nm, as.matrix)
n <- max(sapply(nm, nrow)), lapply(nm, function (x)
rbind(x, matrix(, n-nrow(x), ncol(x)))))


R: Better detach package function

<div>detach_package <- function(pkg, character.only = FALSE)</div>
<div>    if(!character.only)</div>
<div>    {</div>
<div>        pkg <- deparse(substitute(pkg))</div>
<div>    }</div>
<div>    search_item <- paste("package", pkg, sep = ":")</div>
<div>    while(search_item %in% search())</div>
<div>    {</div>
<div>        detach(search_item, unload = TRUE, character.only = TRUE)</div>
<div>    }</div>


R: Remove constant and identical features programmatically

<div>##### Removing constant features</div>
<div>cat("\n## Removing the constants features.\n")</div>
<div>for (f in names(train)) {</div>
<div>  if (length(unique(train[[f]])) == 1) {</div>
<div>    cat(f, "is constant in train. We delete it.\n")</div>
<div>    train[[f]] <- NULL</div>
<div>    test[[f]] <- NULL</div>
<div>  }</div>
<div>##### Removing identical features</div>
<div>features_pair <- combn(names(train), 2, simplify = F)</div>
<div>toRemove <- c()</div>
<div>for(pair in features_pair) {</div>
<div>  f1 <- pair[1]</div>
<div>  f2 <- pair[2]</div>
<div>  if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) {</div>
<div>    if (all(train[[f1]] == train[[f2]])) {</div>
<div>      cat(f1, "and", f2, "are equals.\n")</div>
<div>      toRemove <- c(toRemove, f2)</div>
<div>    }</div>
<div>  }</div>


R: microbenchmark, reshaping big data features

pacman::p_load(data.table, microbenchmark )

train train_mat

f1 f2



R: Remove constant and identical features programmatically

##### Removing constant features
cat("\n## Removing the constants features.\n")
for (f in names(train)) {
  if (length(unique(train[[f]])) == 1) {
    cat(f, "is constant in train. We delete it.\n")
    train[[f]] <- NULL
    test[[f]] <- NULL

##### Removing identical features
features_pair <- combn(names(train), 2, simplify = F)
toRemove <- c()
for(pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]

  if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) {
    if (all(train[[f1]] == train[[f2]])) {
      cat(f1, "and", f2, "are equals.\n")
      toRemove <- c(toRemove, f2)


R: Setup a grid search for xgboost (!!)

I find this code super useful because R’s implementation of xgboost (and to my knowledge Python’s) otherwise lacks support for a grid search:

# set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
nrounds = 1000,
eta = c(0.01, 0.001, 0.0001),
max_depth = c(2, 4, 6, 8, 10),
gamma = 1

# pack the training control parameters
xgb_trcontrol_1 = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
returnData = FALSE,
returnResamp = "all",                                                        # save losses across all models
classProbs = TRUE,                                                           # set to TRUE for AUC to be computed
summaryFunction = twoClassSummary,
allowParallel = TRUE

# train the model for each parameter combination in the grid,
#   using CV to evaluate
xgb_train_1 = train(
x = as.matrix(df_train %>%
y = as.factor(df_train$SeriousDlqin2yrs),
trControl = xgb_trcontrol_1,
tuneGrid = xgb_grid_1,
method = "xgbTree"

# scatter plot of the AUC against max_depth and eta
ggplot(xgb_train_1$results, aes(x = as.factor(eta), y = max_depth, size = ROC, color = ROC)) +
geom_point() +
theme_bw() +
scale_size_continuous(guide = "none")</code>


How to Conditionally Remove Character of a Vector Element in R

I have (sometimes incomplete) data on addresses that looks like this:

data <- c("1600 Pennsylvania Avenue, Washington DC", 
          ",Siem Reap,FC,", "11 Wall Street, New York, NY", ",Addis Ababa,FC,")  

where I need to remove the first and/or last character if either one of them are a comma.

Avinash Raj was able to help me with this on S.O. and the question turned out to be a popular one, so I’ll show the solution here:

> data <- c("1600 Pennsylvania Avenue, Washington DC", 
+           ",Siem Reap,FC,", "11 Wall Street, New York, NY", ",Addis Ababa,FC,")
> gsub("(?<=^),|,(?=$)", "", data, perl=TRUE)
[1] "1600 Pennsylvania Avenue, Washington DC"
[2] "Siem Reap,FC"                           
[3] "11 Wall Street, New York, NY"           
[4] "Addis Ababa,FC" 

Pattern explanation:

  • (?<=^), In regex (?<=) called positive look-behind. In our case it asserts What precedes the comma must be a line start ^. So it matches the starting comma.
  • | Logical OR operator usually used to combine(ie, ORing) two regexes.
  • ,(?=$) Lookahead aseerts that what follows comma must be a line end $. So it matches the comma present at the line end.