Build a multi-headed model that’s capable of detecting different types of toxicity like threats, obscenity, insults, and identity-based hate
This project is launched for the Kaggle Competition: Toxic Comment Classification Challenge — build a multi-headed model that’s capable of detecting different types of toxicity like threats, obscenity, insults, and identity-based hate.
This is my first serious Kaggle competition, and I therefore made a lot of mistakes at the beginning. After two months of soloing the competition, I found it more important to learn than to struggle for LB position.
I trained 35 models with 5-fold CV for RNN/RCNN/Capsule and 10-fold CV for CNN. Because of the limitation on submissions I only evaluated some of them on the LB; the final submission blends all 35 models and some kernels on Kaggle. Together they pushed me into the top 3%.
Model Description | Embedding | Preprocessing | k-folds | LB scores |
---|---|---|---|---|
rcnn | glove.840B.300d | Deep processing and “pre” padding/truncating | 5 | 0.9865 |
rcnn | glove.840B.300d | Deep processing and “post” padding/truncating | 5 | 0.9865 |
rcnn | glove.840B.300d | Soft processing and “pre” padding/truncating | 5 | 0.9861 |
rcnn | fasttext-english.300d | Soft processing and “post” padding/truncating | 5 | 0.9861 |
rcnn | fasttext-crawl-300d-2M | Deep processing and “pre” padding/truncating | 5 | 0.9859 |
gru | fasttext-crawl-300d-2M | Soft processing and “pre” padding/truncating | 5 | 0.985 |
gru | fasttext-crawl-300d-2M | Deep processing and “post” padding/truncating | 5 | 0.9848 |
gru(rm fc layers) | glove.840B.300d | Soft processing and “post” padding/truncating | 5 | 0.9861 |
lstm | fasttext-crawl-300d-2M | Soft processing and “pre” padding/truncating | 5 | 0.9845 |
cnn | glove.840B.300d | Deep processing and “pre” padding/truncating | 10 | 0.9842 |
mvcnn | fasttext-crawl-300d-2M glove.840B.300d | Deep processing and “post” padding/truncating | 10 | 0.9849 |
mvcnn | fasttext-english.300d glove.840B.300d | Soft processing and “post” padding/truncating | 10 | 0.9831 |
mvcnn | fasttext-crawl-300d-2M glove.840B.300d google-word2vec | Deep processing and “post” padding/truncating | 10 | 0.9849 |
capsule | fasttext-crawl-300d-2M | Deep processing and “post” padding/truncating | 5 | 0.9859 |
capsule | glove.840B.300d | Soft processing and “post” padding/truncating | 5 | 0.9856 |
capsule | fasttext-crawl-300d-2M | Deep processing and “pre” padding/truncating | 5 | 0.9854 |
2d cnn | glove.840B.300d | Deep processing and “post” padding/truncating | 10 | 0.9851 |
dpcnn | glove.840B.300d | Deep processing and “post” padding/truncating | 10 | 0.9861 |
dpcnn | fasttext-crawl-300d-2M | Deep processing and “pre” padding/truncating | 10 | 0.9850 |
Refer to here for RCNN, RNN, capsule NN and CNN code.
Refer to here for Multi Channel Variable size CNN (MVCNN), 2D CNN and Deep Pyramid Convolutional Neural Networks(dpcnn).
Refer to here for Conv layer + RNN.
DCNN: Refer to Kalchbrenner et al. (2014). I am still working on testing the models. Please refer to bicepjai for the code.
def charrnn(char_num, num_classes, max_seq_len, filter_sizes=(3, 4, 5, 6, 7), rnn_dim=128, num_filters=64, l2_weight_decay=0.0001, dropout_val=0.25, dense_dim=32, auxiliary=False, dropout=0.2, recurrent_dropout=0.2, add_sigmoid=True, train_embeds=False, gpus=0, add_embeds=True, rnn_type='gru'):
    """Build a character-level classifier: parallel Conv1D branches -> BiRNN -> Conv1D -> pooled sigmoid head.

    Args:
        char_num: size of the character vocabulary (embedding input dim is char_num + 1).
        num_classes: number of sigmoid output units.
        max_seq_len: fixed input sequence length (character ids).
        filter_sizes: kernel sizes for the parallel Conv1D branches.
        rnn_dim: hidden units of the bidirectional RNN.
        num_filters: filters of the Conv1D applied after the RNN.
        dropout_val: SpatialDropout1D rate applied to the embeddings.
        auxiliary: if True, concatenate a 5-dim auxiliary input ('aux_input')
            before the output layer, and the model takes two inputs.
        dropout, recurrent_dropout: RNN dropout rates; only used on the CPU
            (plain LSTM/GRU) path, since CuDNN cells do not support them.
        gpus: if > 0, use CuDNN RNN cells and wrap with multi_gpu_model.
        rnn_type: 'lstm' or 'gru'; any other value falls back to LSTM.
        l2_weight_decay, dense_dim, add_sigmoid, train_embeds, add_embeds:
            accepted for signature compatibility with the other model
            builders in this file but unused here.

    Returns:
        An uncompiled Keras Model.
    """
    use_cudnn = gpus > 0
    if rnn_type == 'gru':
        RNN = CuDNNGRU if use_cudnn else GRU
    else:
        # 'lstm' and any unrecognized rnn_type both fall back to LSTM.
        RNN = CuDNNLSTM if use_cudnn else LSTM
    # Bug fix: CuDNNLSTM/CuDNNGRU do not accept dropout/recurrent_dropout
    # kwargs, so the original call raised TypeError whenever gpus > 0.
    # Pass those kwargs only to the plain LSTM/GRU implementations.
    rnn_kwargs = {'return_sequences': True}
    if not use_cudnn:
        rnn_kwargs['dropout'] = dropout
        rnn_kwargs['recurrent_dropout'] = recurrent_dropout

    input_ = Input(shape=(max_seq_len,))
    x = Embedding(char_num + 1, 300)(input_)
    x = SpatialDropout1D(dropout_val)(x)

    # Parallel convolution branches over the embedded characters, each
    # max-pooled with its own kernel size, then stacked along time.
    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=128, kernel_size=filter_size, padding='valid', activation='relu')(x)
        l_pool = MaxPooling1D(filter_size)(l_conv)
        convs.append(l_pool)
    x = Concatenate(axis=1)(convs)

    x = Bidirectional(RNN(rnn_dim, **rnn_kwargs))(x)
    x = Conv1D(num_filters, kernel_size=2, padding="valid", kernel_initializer="he_uniform")(x)

    # Summarize the sequence with both average and max pooling.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])

    if auxiliary:
        auxiliary_input = Input(shape=(5,), name='aux_input')
        x = Concatenate()([x, auxiliary_input])
    x = Dense(num_classes, activation="sigmoid")(x)

    if auxiliary:
        model = Model(inputs=[input_, auxiliary_input], outputs=x)
    else:
        model = Model(inputs=input_, outputs=x)
    if gpus > 0:
        model = multi_gpu_model(model, gpus=gpus)
    return model
def mulrnn(embedding_matrix, num_classes, max_seq_len, l2_weight_decay=0.0001, rnn_dim=100, dropout_val=0.3, dense_dim=32, add_sigmoid=True, train_embeds=False, gpus=0, rnn_type='lstm', mask_zero=True, auxiliary=True, kernel_regularizer=None, recurrent_regularizer=None, activity_regularizer=None, dropout=0.2, recurrent_dropout=0.2):
    """Multiplicative-LSTM classifier: embedding -> MultiplicativeLSTM(32) -> sigmoid head.

    Maps (batch, max_seq_len) token ids to (batch, num_classes) sigmoid
    scores and returns the uncompiled Keras Model (wrapped with
    multi_gpu_model when gpus > 0).

    NOTE(review): most keyword arguments exist only so this builder shares a
    signature with the other models in this file; rnn_dim, mask_zero,
    auxiliary and the regularizer/dropout arguments are currently ignored —
    the recurrent width is hard-coded to 32.
    """
    vocab_size, embed_dim = embedding_matrix.shape
    tokens = Input(shape=(max_seq_len,))
    embedded = Embedding(vocab_size,
                         embed_dim,
                         weights=[embedding_matrix],
                         input_length=max_seq_len,
                         trainable=train_embeds)(tokens)
    features = MultiplicativeLSTM(32)(embedded)
    scores = Dense(num_classes, activation="sigmoid")(features)
    model = Model(inputs=tokens, outputs=scores)
    return multi_gpu_model(model, gpus=gpus) if gpus > 0 else model
def vdnn(embedding_matrix, num_classes, max_seq_len, num_filters=2, filter_sizes=(64, 128, 256, 512), l2_weight_decay=0.0001, dropout_val=0.5,
         dense_dim=32, add_sigmoid=True, train_embeds=False, auxiliary=True, gpus=0, n_cnn_layers=1, pool='max',
         add_embeds=False):
    """Very Deep CNN (VDCNN-style, Conneau et al.) text classifier.

    Structure: embedding -> Conv1D stem -> four stages of convolutional
    blocks (Conv1D + BatchNorm + ReLU, twice per block), each stage followed
    by max pooling that halves the sequence -> Flatten -> Dense(1024) ->
    Dense(256) -> optional sigmoid head.

    Args:
        embedding_matrix: (vocab_size, embed_dim) pretrained embedding weights.
        num_classes: output units of the final sigmoid layer.
        max_seq_len: fixed input sequence length.
        num_filters: number of convolutional blocks per stage (despite the name).
        filter_sizes: number of filters in each of the four stages.
        add_sigmoid: if False, the model ends at the 256-unit dense layer.
        train_embeds: whether the embedding layer is trainable.
        gpus: if > 0, wrap the model with multi_gpu_model.
        l2_weight_decay, dropout_val, dense_dim, auxiliary, n_cnn_layers,
            pool, add_embeds: accepted for signature compatibility with the
            other model builders, currently unused.

    Returns:
        An uncompiled Keras Sequential model (possibly multi-GPU wrapped).
    """
    model = Sequential([
        Embedding(embedding_matrix.shape[0],
                  embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  input_length=max_seq_len,
                  trainable=train_embeds),
        # Stem convolution before the residual-style stages.
        Conv1D(embedding_matrix.shape[1], 3, padding="valid")
    ])
    # Four stages of convolution blocks, each followed by pooling.
    for stage_filters in filter_sizes:
        for _ in range(num_filters):
            # One convolutional block: (Conv -> BN -> ReLU) x 2.
            model.add(Conv1D(stage_filters, 3, padding="same"))
            model.add(BatchNormalization())
            model.add(Activation("relu"))
            model.add(Conv1D(stage_filters, 3, padding="same"))
            model.add(BatchNormalization())
            model.add(Activation("relu"))
        # Bug fix: VDCNN downsamples with pool_size=3, strides=2. The
        # original code had the arguments swapped (pool_size=2, strides=3),
        # which skipped every third timestep instead of halving the sequence.
        model.add(MaxPooling1D(pool_size=3, strides=2))
    model.add(Flatten())
    model.add(Dense(1024, activation="relu"))
    model.add(Dense(256, activation="relu"))
    if add_sigmoid:
        model.add(Dense(num_classes, activation='sigmoid'))
    if gpus > 0:
        model = multi_gpu_model(model, gpus=gpus)
    return model