Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions osf/management/commands/reindex_versioned_preprints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import logging
from django.core.management.base import BaseCommand

from osf.models import Preprint

logger = logging.getLogger(__name__)


def reindex_versioned_preprints(dry_run=False, batch_size=100, provider_id=None, guids=None):
    """Re-index published versioned preprints to Elasticsearch.

    Ensures computed properties (e.g. ``date_created_first_version``) are
    refreshed in the search index for preprints that already exist.

    Args:
        dry_run: if True, only log what would be re-indexed; make no changes.
        batch_size: chunk size passed to the queryset iterator.
        provider_id: optional provider ``_id`` to restrict the selection.
        guids: optional list of preprint GUIDs; when given, replaces the
            default "all versioned preprints" selection.
    """
    if guids:
        preprints = Preprint.objects.filter(guids___id__in=guids)
    else:
        # distinct() guards against duplicate rows from the versioned_guids join.
        preprints = Preprint.objects.filter(versioned_guids__isnull=False).distinct()

    if provider_id:
        preprints = preprints.filter(provider___id=provider_id)

    preprints = preprints.filter(is_published=True)

    total_count = preprints.count()
    prefix = '[DRY RUN] ' if dry_run else ''
    logger.info(f'{prefix}Found {total_count} versioned preprints to re-index')

    if total_count == 0:
        logger.info('No preprints to re-index')
        return

    processed = 0
    failed = 0
    for preprint in preprints.iterator(chunk_size=batch_size):
        processed += 1

        # Fetch once: the original `.first().version if .exists() else "N/A"`
        # issued two queries and could race between exists() and first().
        first_versioned_guid = preprint.versioned_guids.first()
        version = first_versioned_guid.version if first_versioned_guid else 'N/A'

        if dry_run:
            logger.info(
                f'[DRY RUN] Would re-index preprint {preprint._id} '
                f'(version {version}, '
                f'date_created_first_version={preprint.date_created_first_version}) '
                f'[{processed}/{total_count}]'
            )
        else:
            try:
                preprint.update_search()
                if processed % 10 == 0:
                    logger.info(
                        f'Re-indexed preprint {preprint._id} '
                        f'(version {version}) '
                        f'[{processed}/{total_count}]'
                    )
            except Exception:
                failed += 1
                # logger.exception records the full traceback, not just str(e).
                logger.exception(f'Failed to re-index preprint {preprint._id}')

    summary = (
        f'{prefix}Completed. '
        f'{"Would have re-indexed" if dry_run else "Re-indexed"} {processed} preprints'
    )
    if failed:
        summary += f' ({failed} failed)'
    logger.info(summary)


class Command(BaseCommand):
    """Management-command entry point for ``reindex_versioned_preprints``."""

    help = (
        'Re-index all versioned preprints to Elasticsearch to ensure computed properties '
        'like date_created_first_version are up to date.'
    )

    def add_arguments(self, parser):
        # Keep BaseCommand's standard options (verbosity, traceback, ...).
        super().add_arguments(parser)
        parser.add_argument(
            '--dry-run',
            action='store_true',
            dest='dry_run',
            help='Preview what would be re-indexed without actually making changes',
        )
        parser.add_argument(
            '--batch-size',
            type=int,
            default=100,
            help='Number of preprints to process in each batch (default: 100)',
        )
        parser.add_argument(
            '--provider',
            type=str,
            help='Optional provider ID to filter preprints',
        )
        parser.add_argument(
            '--guids',
            type=str,
            nargs='+',
            help='Optional list of specific preprint GUIDs to re-index',
        )

    def handle(self, *args, **options):
        is_dry_run = options.get('dry_run', False)

        if is_dry_run:
            banner = '=' * 60
            logger.info(banner)
            logger.info('DRY RUN MODE - No changes will be made')
            logger.info(banner)

        reindex_versioned_preprints(
            dry_run=is_dry_run,
            batch_size=options.get('batch_size', 100),
            provider_id=options.get('provider'),
            guids=options.get('guids'),
        )
8 changes: 6 additions & 2 deletions osf/models/preprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,14 +383,18 @@ def create(cls, provider, title, creator, description, manual_guid=None, manual_

def get_last_not_rejected_version(self):
    """Get the last version that is not rejected.

    Returns None if all versions are rejected.
    """
    # NOTE: the pasted diff retained the pre-change line
    # `return self.get_guid().versions.filter(...).first().referent`,
    # which raised AttributeError when every version was rejected
    # (first() returns None). Keep only the None-safe form.
    last_not_rejected = self.get_guid().versions.filter(is_rejected=False).order_by('-version').first()
    return last_not_rejected.referent if last_not_rejected else None

def has_unpublished_pending_version(self):
    """Check if preprint has pending unpublished version.
    Note: use `.check_unfinished_or_unpublished_version()` if checking both types
    """
    version = self.get_last_not_rejected_version()
    # Every version may be rejected, in which case there is nothing pending.
    if not version:
        return False
    return version.machine_state == 'pending' and not version.date_published

def has_initiated_but_unfinished_version(self):
Expand Down Expand Up @@ -801,7 +805,7 @@ def date_created_first_version(self):
if not base_guid:
return self.created

first_version = base_guid.versions.filter(is_rejected=False).order_by('version').first()
first_version = base_guid.versions.order_by('version').first()
Comment on lines -804 to +808
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So what's the reason behind this change? Does it have to do with the corner case where all versions are rejected?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the `filter(is_rejected=False)` ensures that:

  • we always get the actual first version chronologically (version 1), and
  • the code handles the "all versions rejected" case correctly.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I see date_created_first_version is a new property in the target pb&s feature branch. And thus this first_version here only affects this new property.


if first_version and first_version.referent:
return first_version.referent.created
Expand Down
32 changes: 32 additions & 0 deletions tests/test_preprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -2701,6 +2701,38 @@ def test_preprint_version_withdrawal_request_post_mod(self, make_withdrawal_requ
assert new_version.is_published is True
assert new_version.machine_state == ReviewStates.WITHDRAWN.value

def test_date_created_first_version_with_rejected_v1(self, creator, moderator):
    # Version 1: submitted under pre-moderation and then rejected.
    rejected_v1 = PreprintFactory(reviews_workflow='pre-moderation', is_published=False, creator=creator)
    rejected_v1.run_submit(creator)
    rejected_v1.run_reject(moderator, 'Rejecting v1')
    rejected_v1.reload()

    assert rejected_v1.machine_state == ReviewStates.REJECTED.value
    assert rejected_v1.versioned_guids.first().is_rejected is True
    created_v1 = rejected_v1.created

    # Version 2: created from the rejected v1, then accepted and published.
    accepted_v2 = PreprintFactory.create_version(
        create_from=rejected_v1,
        creator=creator,
        final_machine_state='initial',
        is_published=False,
        set_doi=False
    )
    accepted_v2.run_submit(creator)
    accepted_v2.run_accept(moderator, 'Accepting v2')
    accepted_v2.reload()

    assert accepted_v2.machine_state == ReviewStates.ACCEPTED.value
    assert accepted_v2.is_published is True
    created_v2 = accepted_v2.created

    assert created_v2 > created_v1

    # date_created_first_version must report v1's creation date even
    # though v1 was rejected, from either version's perspective.
    assert accepted_v2.date_created_first_version == created_v1
    assert accepted_v2.date_created_first_version != created_v2

    assert rejected_v1.date_created_first_version == created_v1


class TestEmberRedirect(OsfTestCase):

Expand Down
Loading