I am doing text classification with Python and scikit-learn. I have some custom feature-extraction functions that I use in addition to the vectorizers. I would like to know whether they can be used inside an sklearn Pipeline, and how the features would be combined in it.
Here is a short example of my current classification code without a Pipeline. Please tell me if you see that something is wrong with it; I would be very grateful for your help. Is there any way to use it with an sklearn Pipeline? I wrote my own get_features() function, which extracts the custom features, transforms the text with a vectorizer, scales the features, and finally stacks them all together.
import sklearn.svm
import re
from sklearn import metrics
import numpy
import scipy.sparse
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import StandardScaler
def words_capitalized(sentence):
    tokens = word_tokenize(sentence)
    counter = 0
    for word in tokens:
        if word[0].isupper():
            counter += 1
    return counter
def words_length(sentence):
    tokens = word_tokenize(sentence)
    list_of_length = list()
    for word in tokens:
        list_of_length.append(len(word))
    return list_of_length
def get_features(untagged_text, value, scaler):
    list_of_length = list()
    list_of_capitals = list()
    # value == 1 means training data: fit the vectorizer and scaler,
    # otherwise only transform with the already fitted objects
    if value == 1:
        X_bow = countVecWord.fit_transform(untagged_text)
    else:
        X_bow = countVecWord.transform(untagged_text)
    for sentence in untagged_text:
        list_of_length.append([words_length(sentence)])
        list_of_capitals.append([words_capitalized(sentence)])
    X_length = numpy.array(list_of_length)
    X_capitals = numpy.array(list_of_capitals)
    if value == 1:
        X_length = scaler.fit_transform(X_length)
        X_capitals = scaler.fit_transform(X_capitals)
    else:
        X_length = scaler.transform(X_length)
        X_capitals = scaler.transform(X_capitals)
    # stack the bag-of-words matrix with the two extra feature columns
    X_two_bows = scipy.sparse.hstack((X_bow, X_length))
    X_two_bows = scipy.sparse.hstack((X_two_bows, X_capitals))
    return X_two_bows
def fit_and_predict(train_labels, train_features, test_features, classifier):
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)
if __name__ == '__main__':
    input_sets = read_data()
    X = input_sets[0]
    Y = input_sets[1]
    X_dev = input_sets[2]
    Y_dev = input_sets[3]

    countVecWord = CountVectorizer(ngram_range=(1, 3))
    scaler = StandardScaler()

    X_total = get_features(X, 1, scaler)
    X_total_dev = get_features(X_dev, 2, scaler)

    y_train = numpy.asarray(Y)
    y_dev = numpy.asarray(Y_dev)

    SVC1 = LinearSVC(C=1.0)
    y_predicted = fit_and_predict(y_train, X_total, X_total_dev, SVC1)

    print("Result for dev set")
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_dev, y_predicted)
    print("Precision: ", precision, " Recall: ", recall, " F1-Score: ", f1)
As far as I understand, FeatureUnion should be able to replace the scipy.sparse.hstack calls, but I do not see how to plug my own functions into it.
EDIT: I found this tutorial: https://michelleful.imtqy.com/code-blog/2015/06/20/pipelines/
It shows how to write custom transformers and combine them with FeatureUnion, which seems to be what I need for Pipelines.
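Based on that tutorial, here is roughly what I imagine the Pipeline version would look like: each custom function wrapped in a small transformer class and combined with the CountVectorizer through a FeatureUnion. This is only a sketch of what I think it should be, not tested code; the CapitalizedWordsExtractor class name is just something I made up, and it reuses words_capitalized and the imports from my code above.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

class CapitalizedWordsExtractor(BaseEstimator, TransformerMixin):
    """Turn each document into a single numeric feature:
    the number of capitalized words (reusing words_capitalized)."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return numpy.array([[words_capitalized(doc)] for doc in X])

pipeline = Pipeline([
    ('features', FeatureUnion([
        # bag-of-words branch
        ('bow', CountVectorizer(ngram_range=(1, 3))),
        # custom feature branch: extract the count, then scale it
        ('capitals', Pipeline([
            ('count', CapitalizedWordsExtractor()),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', LinearSVC(C=1.0)),
])

pipeline.fit(X, Y)
y_predicted = pipeline.predict(X_dev)

Would this be the correct way to do it? I am especially unsure whether the StandardScaler inside the inner Pipeline gets fitted only on the training data when I call pipeline.fit, and only applied (not refitted) when I call pipeline.predict on the dev set.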