I want to plot the results of this example as a scatter chart:
http://scikit-learn.org/dev/auto_examples/document_clustering.html#example-document-clustering-py
I am new to sklearn and numpy. I want to get the vector coordinate data so that I can build the chart.
EDIT:
Here is what I got so far:
''' Created on Apr 4, 2013 @author: v3ss ''' from classify import recursive_load_files from time import time import numpy as np import pylab as pl from sklearn import metrics from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.preprocessing import scale from sklearn.cluster import KMeans, MiniBatchKMeans from os.path import isdir from os import listdir from os.path import join from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_selection import SelectKBest, chi2 from sklearn.linear_model import Perceptron, RidgeClassifier, SGDClassifier from sklearn.naive_bayes import BernoulliNB, MultinomialNB from sklearn.svm import LinearSVC from sklearn.decomposition import RandomizedPCA from sklearn.utils.validation import check_random_state from time import time import numpy as np import os import traceback def clustering_from_files(trainer_path = "./dataset/dataset/training_data/"): classifier = "NB" load_files = recursive_load_files trainer_path = os.path.realpath(trainer_path) data_train = load_files(trainer_path, load_content = True, shuffle = False) print "Extracting features from the training dataset using a sparse vectorizer" t0 = time() vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.7, stop_words='english',charset_error="ignore") X_train = vectorizer.fit_transform(data_train.data) print "done in %fs" % (time() - t0) print "Targets:",data_train.target km = MiniBatchKMeans(n_clusters=15, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=1)

EDIT:
Now it works better, and I can increase the cluster size.
source share