How to use Sklearn pipeline with custom functions?

I am doing text classification with Python and sklearn. I have some custom features that I use in addition to vectorizers. I would like to know whether they can be used with the sklearn Pipeline and how the features would be combined in it.

Below is a short example of my current classification code without a Pipeline. Please tell me if you see anything wrong with it; I would be very grateful for your help. Is there any way to use it with an sklearn Pipeline? I wrote my own get_features() function, which extracts the custom features, transforms the text with the vectorizer, scales the features, and finally stacks them all together.

import sklearn.svm
import re
from sklearn import metrics
import numpy
import scipy.sparse
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import StandardScaler

# custom feature example
def words_capitalized(sentence):
    """Count the tokens of ``sentence`` that start with an uppercase letter.

    The sentence is split with ``word_tokenize``; every token whose first
    character satisfies ``str.isupper`` contributes one to the total.
    """
    return sum(1 for token in word_tokenize(sentence) if token[0].isupper())

# custom feature example
def words_length(sentence):
    """Return the length of every token in ``sentence``.

    The sentence is split with ``word_tokenize`` and the character count of
    each token is collected in token order.

    Returns:
        A list of ints, one per token (empty when there are no tokens).
    """
    # The built-in is len(); the previous call to length() was undefined.
    return [len(word) for word in word_tokenize(sentence)]

def get_features(untagged_text, value, scaler):
    """Build the combined feature matrix for a collection of sentences.

    Stacks the bag-of-words counts produced by the module-level
    ``countVecWord`` vectorizer with two hand-crafted, scaled features per
    sentence: the mean token length and the number of capitalized tokens.

    Args:
        untagged_text: sequence of raw sentence strings (iterated twice, so
            it must not be a one-shot iterator).
        value: 1 for the training set, in which case the vectorizer and the
            scaler are fitted before transforming; any other value reuses
            their previously fitted state (dev/test set).
        scaler: object exposing ``fit_transform`` and ``transform``.

    Returns:
        A scipy sparse matrix with one row per sentence: the bag-of-words
        columns followed by the scaled length and capitals columns.
    """
    # I also see some people use X_bow = countVecWord.transform(untagged_text).todense(), what does the .todense() option do here?
    # NOTE: .todense() would turn the sparse count matrix into a dense one;
    # the matrix is kept sparse here so it can be stacked with hstack.

    # extract the custom features, one fixed-width row per sentence
    custom_rows = []
    for sentence in untagged_text:
        # words_length yields one value per token; reduce it to a single
        # number so every sentence contributes the same number of columns
        lengths = words_length(sentence)
        mean_length = float(sum(lengths)) / len(lengths) if lengths else 0.0
        custom_rows.append([mean_length, float(words_capitalized(sentence))])

    # turn the feature output into a (n_sentences, 2) numpy matrix
    X_custom = numpy.array(custom_rows, dtype=float).reshape(-1, 2)

    if value == 1:
        # training set: fit the vectorizer and the scaler, then transform.
        # Both custom columns are scaled in one call so that a single fit
        # holds the statistics of both (two separate fits on the same
        # scaler would overwrite each other).
        X_bow = countVecWord.fit_transform(untagged_text)
        X_custom = scaler.fit_transform(X_custom)
    else:
        # dev/test set: transform only, with the statistics learned above
        X_bow = countVecWord.transform(untagged_text)
        X_custom = scaler.transform(X_custom)

    # stack all features as a sparse matrix (each feature exactly once)
    return scipy.sparse.hstack((X_bow, X_custom))

def fit_and_predict(train_labels, train_features, test_features, classifier):
    """Train ``classifier`` and return its predictions for the test features.

    ``classifier.fit`` is called with the training features and labels, then
    ``classifier.predict`` is called with ``test_features`` and its result is
    returned unchanged.
    """
    # learn from the training data first
    classifier.fit(train_features, train_labels)

    # then classify the held-out features
    predictions = classifier.predict(test_features)
    return predictions

if __name__ == '__main__':
    # Script entry point: load the data, build features for the training and
    # dev sets, train a linear SVM and report its scores on the dev set.

    # read_data() is not defined in this snippet; the indexing below shows it
    # is expected to return (train texts, train labels, dev texts, dev labels)
    input_sets = read_data()

    X = input_sets[0]
    Y = input_sets[1]
    X_dev = input_sets[2]
    Y_dev = input_sets[3]

    # initialize the count vectorizer (word uni-, bi- and trigrams);
    # get_features reads it as a module-level name
    countVecWord = CountVectorizer(ngram_range=(1, 3))

    scaler = StandardScaler()

    # extract features

    # for training
    X_total = get_features(X, 1, scaler)

    # for dev set
    X_total_dev = get_features(X_dev, 2, scaler)

    # store labels as numpy array
    y_train = numpy.asarray(Y)
    y_dev = numpy.asarray(Y_dev)

    # train the classifier
    SVC1 = LinearSVC(C=1.0)

    y_predicted = fit_and_predict(y_train, X_total, X_total_dev, SVC1)

    # Single-argument print(...) runs on both Python 2 and Python 3 and keeps
    # the spacing the original print statements produced.
    print("Result for dev set")
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_dev, y_predicted)
    print("Precision:  %s  Recall:  %s  F1-Score:  %s" % (precision, recall, f1))

Yes, this can be done with FeatureUnion, which takes care of the hstack for you.

EDIT: see also this tutorial: https://michelleful.imtqy.com/code-blog/2015/06/20/pipelines/

, , . , Pipelines.

+4
1

, , , FeatureUnion. > sklearn?

+2

All Articles