zilencer: Avoid repeated COUNT(*) queries.

Because Django does not support returning the inserted row-ids with a
`bulk_create(..., ignore_conflicts=True)`, we previously counted the
total rows before and after insertion.  This is rather inefficient,
and can lead to database contention when many servers are reporting
statistics at once.

Switch to reaching into the private `_insert` method, which does
support what we need.  While relying on a private method is poor form,
it is mildly preferable to attempting to re-implement all of its
complexities ourselves.
This commit is contained in:
Alex Vandiver 2024-07-30 22:06:47 +00:00 committed by Tim Abbott
parent 82b0732b11
commit 14a2b5473f
1 changed file with 21 additions and 12 deletions

View File

@ -11,6 +11,7 @@ from django.core.exceptions import ValidationError
from django.core.validators import URLValidator, validate_email from django.core.validators import URLValidator, validate_email
from django.db import IntegrityError, transaction from django.db import IntegrityError, transaction
from django.db.models import Model from django.db.models import Model
from django.db.models.constants import OnConflict
from django.http import HttpRequest, HttpResponse from django.http import HttpRequest, HttpResponse
from django.utils.crypto import constant_time_compare from django.utils.crypto import constant_time_compare
from django.utils.timezone import now as timezone_now from django.utils.timezone import now as timezone_now
@ -796,20 +797,28 @@ def batch_create_table_data(
row_objects: list[ModelT], row_objects: list[ModelT],
) -> None: ) -> None:
# We ignore previously-existing data, in case it was truncated and # We ignore previously-existing data, in case it was truncated and
# re-created on the remote server. `ignore_conflicts=True` # re-created on the remote server. Because the existing
# cannot return the ids, or count thereof, of the new inserts, # `bulk_create(..., ignore_conflicts=True)` cannot yet return the
# (see https://code.djangoproject.com/ticket/0138) so we rely on # ids, or count thereof, of the new inserts, (see
# having a lock to accurately count them before and after. This # https://code.djangoproject.com/ticket/30138), we reach in and
# query is also well-indexed. # call _insert with `returning_fields` in batches ourselves.
before_count = model._default_manager.filter(server=server).count() inserted_count = 0
model._default_manager.bulk_create(row_objects, batch_size=1000, ignore_conflicts=True) expected_count = len(row_objects)
after_count = model._default_manager.filter(server=server).count() fields = [f for f in model._meta.fields if f.concrete and not f.generated and f.name != "id"]
inserted_count = after_count - before_count while row_objects:
if inserted_count < len(row_objects): to_insert, row_objects = row_objects[:1000], row_objects[1000:]
result = model._default_manager._insert( # type:ignore[attr-defined] # This is a private method
to_insert,
fields=fields,
returning_fields=[model._meta.get_field("id")],
on_conflict=OnConflict.IGNORE,
)
inserted_count += len(result)
if inserted_count < expected_count:
logging.warning( logging.warning(
"Dropped %d duplicated rows while saving %d rows of %s for server %s/%s", "Dropped %d duplicated rows while saving %d rows of %s for server %s/%s",
len(row_objects) - inserted_count, expected_count - inserted_count,
len(row_objects), expected_count,
model._meta.db_table, model._meta.db_table,
server.hostname, server.hostname,
server.uuid, server.uuid,