zilencer: Avoid repeated COUNT(*) queries.

Because Django does not support returning the inserted row-ids with a
`bulk_create(..., ignore_conflicts=True)`, we previously counted the
total rows before and after insertion.  This is rather inefficient,
and can lead to database contention when many servers are reporting
statistics at once.

Switch to reaching into the private `_insert` method, which does
support what we need.  While relying on a private method is poor form,
it is mildly preferable to attempting to re-implement all of its
complexities ourselves.
This commit is contained in:
Alex Vandiver 2024-07-30 22:06:47 +00:00 committed by Tim Abbott
parent 82b0732b11
commit 14a2b5473f
1 changed file with 21 additions and 12 deletions

View File

@ -11,6 +11,7 @@ from django.core.exceptions import ValidationError
from django.core.validators import URLValidator, validate_email from django.core.validators import URLValidator, validate_email
from django.db import IntegrityError, transaction from django.db import IntegrityError, transaction
from django.db.models import Model from django.db.models import Model
from django.db.models.constants import OnConflict
from django.http import HttpRequest, HttpResponse from django.http import HttpRequest, HttpResponse
from django.utils.crypto import constant_time_compare from django.utils.crypto import constant_time_compare
from django.utils.timezone import now as timezone_now from django.utils.timezone import now as timezone_now
@ -796,20 +797,28 @@ def batch_create_table_data(
row_objects: list[ModelT], row_objects: list[ModelT],
) -> None: ) -> None:
# We ignore previously-existing data, in case it was truncated and # We ignore previously-existing data, in case it was truncated and
# re-created on the remote server. `ignore_conflicts=True` # re-created on the remote server. Because the existing
# cannot return the ids, or count thereof, of the new inserts, # `bulk_create(..., ignore_conflicts=True)` cannot yet return the
# (see https://code.djangoproject.com/ticket/0138) so we rely on # ids, or count thereof, of the new inserts, (see
# having a lock to accurately count them before and after. This # https://code.djangoproject.com/ticket/30138), we reach in and
# query is also well-indexed. # call _insert with `returning_fields` in batches ourselves.
before_count = model._default_manager.filter(server=server).count() inserted_count = 0
model._default_manager.bulk_create(row_objects, batch_size=1000, ignore_conflicts=True) expected_count = len(row_objects)
after_count = model._default_manager.filter(server=server).count() fields = [f for f in model._meta.fields if f.concrete and not f.generated and f.name != "id"]
inserted_count = after_count - before_count while row_objects:
if inserted_count < len(row_objects): to_insert, row_objects = row_objects[:1000], row_objects[1000:]
result = model._default_manager._insert( # type:ignore[attr-defined] # This is a private method
to_insert,
fields=fields,
returning_fields=[model._meta.get_field("id")],
on_conflict=OnConflict.IGNORE,
)
inserted_count += len(result)
if inserted_count < expected_count:
logging.warning( logging.warning(
"Dropped %d duplicated rows while saving %d rows of %s for server %s/%s", "Dropped %d duplicated rows while saving %d rows of %s for server %s/%s",
len(row_objects) - inserted_count, expected_count - inserted_count,
len(row_objects), expected_count,
model._meta.db_table, model._meta.db_table,
server.hostname, server.hostname,
server.uuid, server.uuid,