Creating a gin index using Trigram (gin_trgm_ops) in a Django model

Question

Creating a gin index using Trigram (gin_trgm_ops) in a Django model

The new TrigramSimilarity feature in django.contrib.postgres worked fine for my problem. I use it for the search bar to find hard-to-pronounce Latin names. The problem is that there are more than 2 million names, and the search takes longer than I want.

I would like to create an index for trigrams as described in the postgres documentation https://www.postgresql.org/docs/9.6/static/pgtrgm.html

But I'm not sure how to make the Django API use it. For postgres text search there is a description of how to create an index, but not for trigram similarity. https://docs.djangoproject.com/en/1.11/ref/contrib/postgres/search/#performance

This is what I have now:

class NCBI_names(models.Model):
    """Name entry linked to an NCBI_nodes row through ``tax_id``."""

    tax_id = models.ForeignKey(
        NCBI_nodes,
        on_delete=models.CASCADE,
        default=0,
    )
    name_txt = models.CharField(max_length=255, default='')
    name_class = models.CharField(
        max_length=32,
        db_index=True,
        default='',
    )

    class Meta:
        # GIN index on the name_txt column.
        indexes = [GinIndex(fields=['name_txt'])]

In the get_queryset method:

 class TaxonSearchListView(ListView):
     """List view for name searches driven by the ``q`` GET parameter."""

     # form_class=TaxonSearchForm
     template_name = 'collectie/taxon_list.html'
     paginate_by = 20
     model = NCBI_names
     context_object_name = 'taxon_list'

     def dispatch(self, request, *args, **kwargs):
         """Redirect on a unique case-insensitive name match, else list.

         With no query, no match, or several matches, the request falls
         through to the regular ListView dispatch.
         """
         query = request.GET.get('q')
         if query:
             try:
                 node = self.model.objects.get(name_txt__iexact=query).tax_id
                 return redirect('collectie:taxon_detail', node.tax_id)
             except (self.model.DoesNotExist, self.model.MultipleObjectsReturned):
                 # Zero or ambiguous hits: show the similarity list instead.
                 pass
         return super(TaxonSearchListView, self).dispatch(request, *args, **kwargs)

     def get_queryset(self):
         """Return names ranked by trigram similarity to the query, if any."""
         queryset = super(TaxonSearchListView, self).get_queryset()
         query = self.request.GET.get('q')
         if not query:
             return queryset
         return (
             queryset
             .exclude(name_txt__icontains='sp.')
             .annotate(similarity=TrigramSimilarity('name_txt', query))
             .filter(similarity__gt=0.3)
             .order_by('-similarity')
         )

+10

python django indexing postgresql similarity

Allcor Jun 29 '17 at 8:46

source share

4 answers

Inspired by an old article on this topic, I landed on a current one, which gives the following solution for a GistIndex:

Update: from Django-1.11, things seem simpler, as this answer and the Django docs suggest:

 from django.contrib.postgres.indexes import GinIndex


 class MyModel(models.Model):
     """Example model whose ``the_field`` column gets a GIN index."""

     the_field = models.CharField(max_length=512, db_index=True)

     class Meta:
         indexes = [GinIndex(fields=['the_field'])]

Starting with Django-2.2, an opclasses attribute is available on class Index(fields=(), name=None, db_tablespace=None, opclasses=()) for this purpose.

 from django.contrib.postgres.indexes import GistIndex


 class GistIndexTrgrmOps(GistIndex):
     """GiST index that applies the ``gist_trgm_ops`` operator class.

     Intended for Django versions without the ``opclasses`` argument on
     ``Index``.
     """

     def create_sql(self, model, schema_editor, **kwargs):
         """Return the CREATE INDEX statement with ``gist_trgm_ops`` added.

         Extra keyword arguments are handed to the parent implementation
         unchanged, so the override keeps working when the schema editor
         passes more than ``model`` and ``schema_editor``.
         """
         # - this Statement is instantiated by the _create_index_sql()
         #   method of django.db.backends.base.schema.BaseDatabaseSchemaEditor,
         #   using the sql_create_index template from
         #   django.db.backends.postgresql.schema.DatabaseSchemaEditor
         # - the template has a value such as:
         #   "CREATE INDEX %(name)s ON %(table)s%(using)s (%(columns)s)%(extra)s"
         statement = super().create_sql(model, schema_editor, **kwargs)
         # - however, we want to use a GiST index to accelerate trigram
         #   matching, so we add the gist_trgm_ops operator class
         # - only the column list is rewritten, so whatever placeholders the
         #   backend template has after it are preserved:
         #   "... (%(columns)s gist_trgm_ops)..."
         statement.template = statement.template.replace(
             '(%(columns)s)', '(%(columns)s gist_trgm_ops)', 1
         )
         return statement

Which you can then use in your class models as follows:

 class YourModel(models.Model):
     """Example model indexed through GistIndexTrgrmOps."""

     some_field = models.TextField(...)

     class Meta:
         indexes = [
             GistIndexTrgrmOps(fields=['some_field']),
         ]

+4

raratiru Aug 16 '18 at 15:31

source share

If someone wants to have an index on multiple columns concatenated with a space, you can use my modification of the built-in index.

Creates an index, for example gin (("column1" || ' ' || "column2" || ' ' || ...) gin_trgm_ops)

 class GinSpaceConcatIndex(GinIndex):
     """GIN trigram index over several columns joined with single spaces."""

     def get_sql_create_template_values(self, model, schema_editor, using):
         """Build the values substituted into the CREATE INDEX template."""
         quote = schema_editor.quote_name
         model_fields = []
         quoted_columns = []
         for field_name, order in self.fields_orders:
             field = model._meta.get_field(field_name)
             model_fields.append(field)
             quoted_columns.append(
                 ('%s %s' % (quote(field.column), order)).strip()
             )
         # All columns become one concatenated expression with the opclass.
         concatenated = " || ' ' || ".join(quoted_columns)
         return {
             'table': quote(model._meta.db_table),
             'name': quote(self.name),
             'columns': "({}) gin_trgm_ops".format(concatenated),
             'using': using,
             'extra': schema_editor._get_index_tablespace_sql(model, model_fields),
         }

+2

n1_ Dec 18 '17 at 7:46

source share

To make Django 2.2 use the index for icontains and similar searches:

Subclass of GinIndex:

 from django.contrib.postgres.indexes import GinIndex


 class UpperGinIndex(GinIndex):
     """GIN index whose columns are wrapped in ``UPPER(...)``."""

     def create_sql(self, model, schema_editor, using=''):
         """Return the parent's statement with every column upper-cased in SQL."""
         statement = super().create_sql(model, schema_editor, using=using)
         columns = statement.parts['columns']
         original_quote = columns.quote_name
         # Swap the quoting callable so each column is emitted as
         # UPPER(<quoted column>).
         columns.quote_name = lambda column: 'UPPER({})'.format(original_quote(column))
         return statement

Add an index to your model as follows, including the name kwarg, which is required when using opclasses :

 class MyModel(Model):
     """Example model indexed through UpperGinIndex with gin_trgm_ops."""

     name = TextField(...)

     class Meta:
         indexes = [
             UpperGinIndex(
                 fields=['name'],
                 name='mymodel_name_gintrgm',
                 opclasses=['gin_trgm_ops'],
             ),
         ]

Generate the migration and edit the generated file:

 # Generated by Django 2.2.3 on 2019-07-15 10:46

 from django.contrib.postgres.operations import TrigramExtension  # <<< add this
 from django.db import migrations
 import myapp.models


 class Migration(migrations.Migration):

     operations = [
         # Listed first so it runs before the index that uses gin_trgm_ops.
         TrigramExtension(),  # <<< add this
         migrations.AddIndex(
             model_name='mymodel',
             index=myapp.models.UpperGinIndex(
                 fields=['name'],
                 name='mymodel_name_gintrgm',
                 opclasses=['gin_trgm_ops'],
             ),
         ),
     ]

+2

Risadinha Jul 15 '19 at 11:26

source share

TimB · Accepted Answer · 2017-07-07T05:08:04+0000

I had a similar problem trying to use the pg_trgm extension to support efficient contains and icontains Django field lookups.

There might be a more elegant way, but defining a new index type like this worked for me:

 from django.contrib.postgres.indexes import GinIndex


 class TrigramIndex(GinIndex):
     """GIN index that tags every indexed column with ``gin_trgm_ops``."""

     def get_sql_create_template_values(self, model, schema_editor, using):
         """Build the CREATE INDEX template values, adding the trigram opclass."""
         quote = schema_editor.quote_name
         model_fields = []
         column_sql = []
         for field_name, order in self.fields_orders:
             field = model._meta.get_field(field_name)
             model_fields.append(field)
             quoted = ('%s %s' % (quote(field.column), order)).strip()
             # The operator class follows each column definition.
             column_sql.append(quoted + ' gin_trgm_ops')
         return {
             'table': quote(model._meta.db_table),
             'name': quote(self.name),
             'columns': ', '.join(column_sql),
             'using': using,
             'extra': schema_editor._get_index_tablespace_sql(model, model_fields),
         }

The get_sql_create_template_values method is copied from Index.get_sql_create_template_values() with only one modification: adding + ' gin_trgm_ops' .

For your use case, you then define an index on name_txt using this TrigramIndex instead of GinIndex . Then run makemigrations , which will result in a migration that generates the required CREATE INDEX SQL.

UPDATE:

I see that you are also executing a query using icontains :

 # Drops every row whose name_txt contains 'sp.' (case-insensitive lookup).
 result.exclude(name_txt__icontains = 'sp.')

The Postgresql backend will turn this into something like this:

 -- a contains lookup wraps the search term in % wildcards before LIKE sees it
 UPPER("NCBI_names"."name_txt"::text) LIKE UPPER('%sp.%')

and then the trigram index will not be used due to UPPER() .

I had the same problem and ended up subclassing the database backend to get around it:

 from django.db.backends.postgresql import base, operations


 class DatabaseFeatures(base.DatabaseFeatures):
     pass


 class DatabaseOperations(operations.DatabaseOperations):
     """PostgreSQL operations that leave the column bare for ILIKE lookups."""

     def lookup_cast(self, lookup_type, internal_type=None):
         """Return the SQL template applied to the column side of a lookup.

         Text lookups are cast to ``::text`` (``HOST()`` for IP address
         fields). The column is not wrapped in ``UPPER()`` for icontains,
         istartswith and iendswith, because those use ILIKE here and a
         trigram index can then be used. ``iexact`` keeps ``UPPER()``.
         """
         lookup = '%s'

         # Cast text lookups to text to allow things like filter(x__contains=4)
         text_lookups = (
             'iexact', 'contains', 'icontains', 'startswith', 'istartswith',
             'endswith', 'iendswith', 'regex', 'iregex',
         )
         if lookup_type in text_lookups:
             if internal_type in ('IPAddressField', 'GenericIPAddressField'):
                 lookup = "HOST(%s)"
             else:
                 lookup = "%s::text"

         # iexact is not switched to ILIKE below and still compares against
         # UPPER(<value>), so its column side must stay upper-cased or the
         # match stops being case-insensitive.
         if lookup_type == 'iexact':
             lookup = 'UPPER(%s)' % lookup

         return lookup


 class DatabaseWrapper(base.DatabaseWrapper):
     """
     Override the defaults where needed to allow use of trigram index
     """
     ops_class = DatabaseOperations

     # Work on copies: updating the inherited dicts in place would also
     # change them on base.DatabaseWrapper for every other user of it.
     operators = dict(base.DatabaseWrapper.operators)
     operators.update({
         'icontains': 'ILIKE %s',
         'istartswith': 'ILIKE %s',
         'iendswith': 'ILIKE %s',
     })

     pattern_ops = dict(base.DatabaseWrapper.pattern_ops)
     pattern_ops.update({
         'icontains': "ILIKE '%%' || {} || '%%'",
         'istartswith': "ILIKE {} || '%%'",
         'iendswith': "ILIKE '%%' || {}",
     })

Creating a gin index using Trigram (gin_trgm_ops) in a Django model

More articles: