Skip to content

Instantly share code, notes, and snippets.

@jacobian
Created November 4, 2018 14:30
Show Gist options
  • Select an option

  • Save jacobian/4b65b3fbb7103829b941e23c506c822a to your computer and use it in GitHub Desktop.

Select an option

Save jacobian/4b65b3fbb7103829b941e23c506c822a to your computer and use it in GitHub Desktop.

Revisions

  1. jacobian created this gist Nov 4, 2018.
    52 changes: 52 additions & 0 deletions models.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,52 @@
    # https://stackoverflow.com/questions/53139643/django-postgres-array-field-count-number-of-overlaps

    # !!! DOESN'T WORK but might with some more poking?

    class Article(models.Model):
    keywords = ArrayField(models.CharField(max_length=100))

    def __str__(self):
    return f"<Article {self.id} keywords={self.keywords}>"

    def find_similar_articles(self, min_overlap=2):
    # We need the field type to properly cast values below
    kf = self.__class__._meta.get_field("keywords")

    # Construct the inner subquery, which is of the form
    # `SELECT UNNEST(a1) INTERSECT UNNEST(a2)`
    # where a1 and a2 are arrays of keywords.`
    #
    # First step: the left- and right-hand queries, which UNNNEST
    # the keyword field and provided values into rows
    lhs = Article.objects.annotate(tag=Unnest(models.F("keywords")))
    rhs = Article.objects.annotate(tag=Unnest(self.keywords, output_field=kf))

    # Then, we need to make sure to use 'values' to ensure that the
    # subquery returns only a single field.
    lhs = lhs.values("tag")
    rhs = rhs.values("tag")

    # And here's that inner INTERSECT subquery
    intersecting_tags_subquery = models.Subquery(
    lhs.intersection(rhs), output_field=kf
    )

    # Finally, do the query:
    # 1. exclude this row, because it will (by definition) match
    qs = self.__class__.objects.exclude(id=self.id)
    # 2. annotate with the number of matching rows
    qs = qs.annotate(matching_keywords=Array(intersecting_tags_subquery))
    # 3. and find only rows with more than min_overlap matches
    qs = qs.filter(matching_keywords__len__gte=min_overlap)

    return qs


    class Unnest(models.Func):
    function = "unnest"


    class Array(models.Func):
    function = "array"
    arity = 1