Archive for the ‘Math & Statistics’ Category

LightGBM Grid Search Example in R

library(data.table)
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train  <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data = FALSE)
data(agaricus.test, package = "lightgbm")
test  <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
valids <- list(test = dtest)

grid_search <- expand.grid(Depth = 8,
                           L1 = 0:5,
                           L2 = 0:5)

model <- list()
perf <- numeric(nrow(grid_search))

# fit a small model for each L1/L2 combination and record its best validation l2
for (i in 1:nrow(grid_search)) {
  model[[i]] <- lgb.train(params = list(objective = "regression",
                                        metric = "l2",
                                        lambda_l1 = grid_search[i, "L1"],
                                        lambda_l2 = grid_search[i, "L2"],
                                        max_depth = grid_search[i, "Depth"],
                                        min_data = 1,
                                        learning_rate = 1),
                          data = dtrain,
                          nrounds = 2,
                          valids = valids,
                          early_stopping_rounds = 5)
  perf[i] <- min(unlist(model[[i]]$record_evals$test$l2$eval))
}

Result:
> cat("Model ", which.min(perf), " is lowest loss: ", min(perf), sep = "")
Model 1 is lowest loss: 1.972152e-31
> print(grid_search[which.min(perf), ])
  Depth L1 L2
1     8  0  0
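
As a quick follow-up (my addition, not part of the original post), the winning booster from the grid can be pulled out and used to score the validation data directly:

# my addition: reuse the best model from the grid for prediction
best  <- model[[which.min(perf)]]
preds <- predict(best, test$data)
head(preds)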

Example XGBoost Grid Search in Python

import sys
import math
 
import numpy as np
from sklearn.grid_search import GridSearchCV
 
sys.path.append('xgboost/wrapper/')
import xgboost as xgb
 
 
class XGBoostClassifier():
    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})
 
    def fit(self, X, y, num_boost_round=None):
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
 
    def predict(self, X):
        num2label = dict((i, label)for label, i in self.label2num.items())
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])
 
    def predict_proba(self, X):
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
 
    def score(self, X, y):
        Y = self.predict_proba(X)
        return 1 / logloss(y, Y)
 
    def get_params(self, deep=True):
        # include num_boost_round so sklearn can clone this estimator faithfully
        return dict(self.params, num_boost_round=self.num_boost_round)
 
    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
   
   
def logloss(y_true, Y_pred):
    # multi-class log loss: Y_pred rows are class-probability vectors, y_true holds labels
    label2num = dict((name, i) for i, name in enumerate(sorted(set(y_true))))
    return -1 * sum(math.log(y[label2num[label]]) if y[label2num[label]] > 0 else -np.inf
                    for y, label in zip(Y_pred, y_true)) / len(Y_pred)


def main():
    clf = XGBoostClassifier(
        eval_metric = 'auc',
        num_class = 2,
        nthread = 4,
        eta = 0.1,
        num_boost_round = 80,
        max_depth = 12,
        subsample = 0.5,
        colsample_bytree = 1.0,
        silent = 1,
        )
    parameters = {
        'num_boost_round': [100, 250, 500],
        'eta': [0.05, 0.1, 0.3],
        'max_depth': [6, 9, 12],
        'subsample': [0.9, 1.0],
        'colsample_bytree': [0.9, 1.0],
    }
    clf = GridSearchCV(clf, parameters, n_jobs=1, cv=2)
   
    clf.fit([[1,2], [3,4], [2,1], [4,3], [1,0], [4,5]], ['a', 'b', 'a', 'b', 'a', 'b'])
    best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
    print(score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
               
    print(clf.predict([[1,2]]))


if __name__ == '__main__':
    main()

Happy Pi Day 2016!

Has it really been a whole year?

On Pi Day 2015, hack-r.com posted a tribute to Pi (π) Day, published on GitHub, in which we created fractals in R based on π, scraped and displayed information about π, and had other fun with the number.

This year, find out how the Fibonacci numbers (the integer sequence in which each term is the sum of the two before it) have a freaky relationship with π! View the entire script on GitHub.


# Pi Fibonacci Sequence ---------------------------------------------------
cat("This year, we'll look at the relationship between Pi and Fibonacci sequences. \n")
cat("Until very recently there were just two methods used to compute pi (π),
one invented by the Greek mathematician Archimedes,
and the other by the Scottish mathematician James Gregory. \n")

cat("If we use Sir Gregory's arc tangent method, you'll notice a pattern...")

pi/4
atan(1)

pi/4 == atan(1)

# atan(1/3) splits into atan(1/5) + atan(1/8)...
atan(1/3)
atan(1/5)  + atan(1/8)

# ...and atan(1/8) splits into atan(1/13) + atan(1/21)
atan(1/8)
atan(1/13) + atan(1/21)

cat("We can combine what we saw above. \n")
pi/4
atan(1/2) + atan(1/3)
atan(1/2) + atan(1/5) + atan(1/8)

# the pattern keeps going: atan(1/21) = atan(1/34) + atan(1/55)
atan(1/21)
atan(1/34) + atan(1/55)

cat("You'll notice that the pattern is a Fibonacci sequence! \n")

cat(" We have just seen that there are infinitely many formulae for π using the Fibonacci numbers!")

pi
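
To see why this gives infinitely many formulae, note that each atan(1/F(2k)) above splits into atan(1/F(2k+1)) + atan(1/F(2k+2)), so the splitting can be repeated forever. Here is a small follow-up sketch (my addition, not part of the GitHub script) that telescopes the identity and watches the partial sums close in on π:

# my addition: telescoping atan(1/F(2k)) = atan(1/F(2k+1)) + atan(1/F(2k+2))
# gives pi/4 = atan(1/F(3)) + atan(1/F(5)) + atan(1/F(7)) + ...
fib <- numeric(30)
fib[1:2] <- c(1, 1)
for (k in 3:30) fib[k] <- fib[k - 1] + fib[k - 2]

# partial sums over the odd-indexed Fibonacci numbers 2, 5, 13, 34, ...
partial <- cumsum(atan(1 / fib[seq(3, 29, by = 2)]))
cbind(terms = seq_along(partial), pi_estimate = 4 * partial, pi = pi)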


R: microbenchmark, reshaping big data features


pacman::p_load(data.table, microbenchmark )

train train_mat

f1 f2

microbenchmark(f1(),f2(),times=10)
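
The snippet above is missing the definitions of train, train_mat, f1 and f2, so here is a minimal self-contained sketch of the kind of timing comparison the post describes; the toy data and the two reshaping functions below are hypothetical stand-ins of my own, not the original code:

pacman::p_load(data.table, microbenchmark)

# hypothetical stand-ins: a wide table of features and its matrix counterpart
train     <- as.data.table(matrix(rnorm(1e6), nrow = 1e4))   # 10,000 rows x 100 features
train_mat <- as.matrix(train)

# two ways to reshape the features from wide to long
f1 <- function() data.table::melt(train, measure.vars = names(train))
f2 <- function() data.frame(variable = rep(colnames(train_mat), each = nrow(train_mat)),
                            value    = as.vector(train_mat))

microbenchmark(f1(), f2(), times = 10)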


Kaggle – my brief shining moment in the top 10

I started playing with the (all too addictive) Kaggle competitions this past December, on and off.

This past week I reached a personal high point by making the top 10 in a featured competition for the first time.

[Screenshot: Kaggle leaderboard capture showing the top-10 placement]

Since then, my ranking has dropped a bit, but there’s still time for me to take first! 😉 Just don’t hold your breath…

R: Remove constant and identical features programmatically


##### Removing constant features
cat("\n## Removing the constants features.\n")
for (f in names(train)) {
  if (length(unique(train[[f]])) == 1) {
    cat(f, "is constant in train. We delete it.\n")
    train[[f]] <- NULL
    test[[f]] <- NULL
  }
}

##### Removing identical features
features_pair <- combn(names(train), 2, simplify = F)
toRemove <- c()
for(pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]

  if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) {
    if (all(train[[f1]] == train[[f2]])) {
      cat(f1, "and", f2, "are equals.\n")
      toRemove <- c(toRemove, f2)
    }
  }
}
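
Note that the loop above only collects the names of the redundant columns in toRemove; as a small follow-up (my addition, assuming train and test are data.frames as in the loops above), the columns still have to be dropped:

# my addition: actually drop the collected duplicate columns from both sets
train <- train[, !(names(train) %in% toRemove), drop = FALSE]
test  <- test[, !(names(test) %in% toRemove), drop = FALSE]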


R: Setup a grid search for xgboost (!!)

I find this code super useful because R's implementation of xgboost (and, to my knowledge, Python's) doesn't ship with a built-in grid search; caret's train() fills that gap:

# requires caret (for trainControl/train), xgboost, dplyr and ggplot2
library(caret)
library(xgboost)
library(dplyr)
library(ggplot2)

# set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
  nrounds = 1000,
  eta = c(0.01, 0.001, 0.0001),
  max_depth = c(2, 4, 6, 8, 10),
  gamma = 1
)

# pack the training control parameters
xgb_trcontrol_1 = trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",           # save losses across all models
  classProbs = TRUE,              # set to TRUE for AUC to be computed
  summaryFunction = twoClassSummary,
  allowParallel = TRUE
)

# train the model for each parameter combination in the grid,
#   using CV to evaluate
xgb_train_1 = train(
  x = as.matrix(df_train %>%
                  select(-SeriousDlqin2yrs)),
  y = as.factor(df_train$SeriousDlqin2yrs),
  trControl = xgb_trcontrol_1,
  tuneGrid = xgb_grid_1,
  method = "xgbTree"
)

# scatter plot of the AUC against max_depth and eta
ggplot(xgb_train_1$results, aes(x = as.factor(eta), y = max_depth, size = ROC, color = ROC)) +
  geom_point() +
  theme_bw() +
  scale_size_continuous(guide = "none")
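
As a quick follow-up (my addition), the winning hyper-parameter combination and the cross-validated AUC behind the plot can be read straight off the caret object:

# my addition: inspect the grid-search results
xgb_train_1$bestTune                                          # best hyper-parameter combination
head(xgb_train_1$results[order(-xgb_train_1$results$ROC), ])  # combinations ranked by CV AUC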


Mean and Multi-modal Mean Functions (methods) for Java

When it comes to stats, Java ain't no R. Still, we can do anything in one language that we can do in another.

Let’s have a look at some mean functions for Java, to illustrate:

public static double mean(double[] m) {
    double sum = 0;
    for (int i = 0; i < m.length; i++) {
        sum += m[i];
    }
    return sum / m.length;
}
For multi-modal:
public static List<Integer> mode(final int[] numbers) {
     final List<Integer> modes = new ArrayList<Integer>();
     final Map<Integer, Integer> countMap = new HashMap<Integer, Integer>();

     int max = -1;

     for (final int n : numbers) {
         int count = 0;

         if (countMap.containsKey(n)) {
             count = countMap.get(n) + 1;
         } else {
             count = 1;
         }

         countMap.put(n, count);

         if (count > max) {
             max = count;
         }
     }

     for (final Map.Entry<Integer, Integer> tuple : countMap.entrySet()) {
         if (tuple.getValue() == max) {
             modes.add(tuple.getKey());
         }
     }

     return modes;
}


R: Happy Pi Day

Today, 3/14/2015, is Pi Day (see http://piday.org).

In honor of Pi Day, I threw together a little R code on GitHub, which discusses pi, prints it, and creates Julia set (fractal) images based on it:

https://github.com/hack-r/Rpiday
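
For readers who don't want to open the repository, here is a minimal sketch of the idea (my own simplified version, not the script in the repo): a Julia set rendered in base R with the imaginary part of the complex constant taken from the fractional part of π.

# my simplified sketch (not the GitHub script): an escape-time Julia set whose
# constant uses the fractional part of pi as its imaginary component
julia <- function(c, res = 400, iters = 100, lim = 2) {
  x <- seq(-1.5, 1.5, length.out = res)
  z <- outer(x, x * 1i, "+")            # grid of complex starting points
  counts <- matrix(0L, res, res)        # iterations survived before escaping
  for (k in seq_len(iters)) {
    alive <- Mod(z) <= lim
    z[alive] <- z[alive]^2 + c
    counts[alive] <- counts[alive] + 1L
  }
  counts
}

m <- julia(complex(real = -0.8, imaginary = pi - 3))   # c = -0.8 + 0.14159...i
image(m, col = heat.colors(100), axes = FALSE)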

Happy Pi Day!

[Image: pi_fractal, one of the Julia set fractal images produced by the script]