2013/01/08(火)Djangoからtextsearch_jaを利用する

2013/01/08 20:32
Djangoからtextsearch_jaを呼び出す場合、素直にやるとORマッパーの恩恵を受けることができず、SQLのWHERE句をゴリゴリ手書きすることになります。 毎回毎回これではちょっと面倒なので、searchメソッドを呼び出すことで全文検索ができるModelを作ってみました。作ったとは言っても、一から作ったわけではなく、Django snippetsにあったtsvectorを操作するモデルにちょこっと手を入れただけですが。手を入れた部分はコメントで補足してあります。
[crayon lang="python"] # -*- coding: utf-8 -*- """ django.db.models.Model を拡張して、PostgreSQLのtsvector型に対応したモデルとマネジャを提供します。 http://djangosnippets.org/snippets/1328/ で公開されているコードを一部修正して使わせていただきました。 以下、原文のコメントです。 -------------------------------------------------- Support for full-text searchable Django models using tsearch2 in PostgreSQL. An example: from search import SearchableModel, SearchManager from django.db import models class TestModel (SearchableModel): name = models.CharField( max_length=100 ) description = models.TextField() # Defining a SearchManager without fields will use all CharFields and TextFields # objects = SearchManager() # You can pass a list of fields that should be indexed # objects = SearchManager( fields=('name','description') ) # You may also specify fields as a dictionary, mapping each field to a weight for ranking purposes # see http://www.postgresql.org/docs/8.3/static/textsearch-features.html#TEXTSEARCH-MANIPULATE-TSVECTOR objects = SearchManager( fields={ 'name': 'A', 'description': 'B', } ) # Create some test data. By default, the index field is automatically updated when save() is called. TestModel.objects.create( name='Model One', description='Hello world, this is a test.' ) TestModel.objects.create( name='Model Two', description='Testing, testing, one two three.' 
) # You can force an index update to all or some instances: TestModel.objects.update_index() TestModel.objects.update_index( pk=1 ) TestModel.objects.update_index( pk=[1,2] ) # Perform a search with no ranking TestModel.objects.search( 'hello' ) # Perform a search that ranks the results, orders by the rank, and assigns the ranking # value to the field specified by rank_field TestModel.objects.search( 'test', rank_field='rank' ) """ from django.db import models class VectorField (models.Field): def __init__( self, *args, **kwargs ): kwargs['null'] = True kwargs['editable'] = False kwargs['serialize'] = False super( VectorField, self ).__init__( *args, **kwargs ) def db_type( self, connection=None ): # Django1.4対応のため、仮引数connectionを追加 return 'tsvector' class SearchableModel (models.Model): """ A convience Model wrapper that provides an update_index method for object instances, as well as automatic index updating. The index is stored as a tsvector column on the model's table. A model may specify a boolean class variable, _auto_reindex, to control whether the index is automatically updated when save is called. """ search_index = VectorField() class Meta: abstract = True def update_index( self ): if hasattr( self, '_search_manager' ): self._search_manager.update_index( pk=self.pk ) def save( self, *args, **kwargs ): super( SearchableModel, self ).save( *args, **kwargs ) if hasattr( self, '_auto_reindex' ): if self._auto_reindex: self.update_index() else: self.update_index() class SearchManager (models.Manager): def __init__( self, fields=None, config=None ): self.fields = fields self.default_weight = 'A' self.config = config and config or 'pg_catalog.english' self._vector_field_cache = None super( SearchManager, self ).__init__() def contribute_to_class( self, cls, name ): # Instances need to get to us to update their indexes. 
setattr( cls, '_search_manager', self ) super( SearchManager, self ).contribute_to_class( cls, name ) def _find_text_fields( self ): """ Return the names of all CharField and TextField fields defined for this manager's model. """ fields = [f for f in self.model._meta.fields if isinstance(f,(models.CharField,models.TextField))] return [f.name for f in fields] def _vector_field( self ): """ Returns the VectorField defined for this manager's model. There must be exactly one VectorField defined. """ if self._vector_field_cache is not None: return self._vector_field_cache vectors = [f for f in self.model._meta.fields if isinstance(f,VectorField)] if len(vectors) != 1: raise ValueError( "There must be exactly 1 VectorField defined for the %s model." % self.model._meta.object_name ) self._vector_field_cache = vectors[0] return self._vector_field_cache vector_field = property( _vector_field ) def _vector_sql( self, field, weight=None ): """ Returns the SQL used to build a tsvector from the given (django) field name. """ if weight is None: weight = self.default_weight f = self.model._meta.get_field( field ) return "setweight( to_tsvector( '%s', coalesce(\"%s\",'') ), '%s' )" % (self.config, f.column, weight) def update_index( self, pk=None ): """ Updates the full-text index for one, many, or all instances of this manager's model. """ from django.db import connection # Build a list of SQL clauses that generate tsvectors for each specified field. clauses = [] if self.fields is None: self.fields = self._find_text_fields() if isinstance( self.fields, (list,tuple) ): for field in self.fields: clauses.append( self._vector_sql(field) ) else: for field, weight in self.fields.items(): clauses.append( self._vector_sql(field,weight) ) vector_sql = ' || '.join( clauses ) where = '' # If one or more pks are specified, tack a WHERE clause onto the SQL. 
if pk is not None: if isinstance( pk, (list,tuple) ): ids = ','.join( [str(v) for v in pk] ) where = " WHERE \"%s\" IN (%s)" % (self.model._meta.pk.column, ids) else: where = " WHERE \"%s\" = %s" % (self.model._meta.pk.column, pk) sql = "UPDATE \"%s\" SET \"%s\" = %s%s;" % (self.model._meta.db_table, self.vector_field.column, vector_sql, where) cursor = connection.cursor() cursor.execute( sql ) # cursor.execute( "COMMIT;" ) # TransactionMiddleware対応のため削除 # cursor.close() # TransactionMiddleware対応のため削除 def search( self, query, rank_field=None, rank_normalization=32, use_web_query=False ): # web_query対応のため引数追加 """ Returns a queryset after having applied the full-text search query. If rank_field is specified, it is the name of the field that will be put on each returned instance. When specifying a rank_field, the results will automatically be ordered by -rank_field. For possible rank_normalization values, refer to: http://www.postgresql.org/docs/8.3/static/textsearch-controls.html#TEXTSEARCH-RANKING """ # web_query対応のため修正 # ts_query = "to_tsquery('%s','%s')" % (self.config, unicode(query).replace("'","''")) if use_web_query: to_tsquery_string = "to_tsquery('%s',web_query('%s'))" else: to_tsquery_string = "to_tsquery('%s','%s')" ts_query = to_tsquery_string % (self.config, unicode(query).replace("'","''")) where = "\"%s\" @@ %s" % (self.vector_field.column, ts_query) select = {} order = [] if rank_field is not None: select[rank_field] = 'ts_rank( "%s", %s, %d )' % (self.vector_field.column, ts_query, rank_normalization) order = ['-%s' % rank_field] return self.all().extra( select=select, where=[where], order_by=order ) [/crayon]
呼び出す側では、searchメソッドに条件やrankに利用するフィールド名を渡すことで全文検索が実行できます。
[crayon lang="python"] result = Spam.objects.search(query='俺達 炎上', rank_field='rank', use_web_query=True) [/crayon]
`use_web_query`に`True`を渡すことで、textsearch_jaのweb_queryの機能を使うことができ、スペース区切りの各文字列をAND条件として扱ったり、`OR`で結ぶことでOR条件として扱うことができます。web_queryの詳細は[公式サイト](http://textsearch-ja.projects.pgfoundry.org/textsearch_ja.html#web_query)で。 また、searchメソッドだけではなく、別の条件を組み合わせることもできます。
[crayon lang="python"] result = Spam.objects.search(query='検索条件', rank_field='rank', use_web_query=True).filter(enable=True) [/crayon]
これでだいぶスマートにtextsearch_jaを扱うことができるようになりました。まぁ、95%以上のコードは俺じゃなくて[dcwatsonさん](http://djangosnippets.org/users/dcwatson/)が書いたんだけど。