Assign all datasets to DatasetVersions based on legacy data.

Intended for use in a data migration. Does the following:

- Assign datasets to the same DatasetVersions as other datasets in dataset_json.dataset_version_set
- Add the id of each dataset in dataset_version_set to DatasetVersions.legacy_versions
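Because the function accepts the app registry as `apps`, it can be called directly from a Django data migration. The sketch below shows one way to wire it in via `RunPython`; the app label, migration dependency, and import path are illustrative assumptions, not part of the source.

```python
# Minimal sketch of calling migrate_dataset_versions from a data migration.
# The app label, dependency, and import path below are assumptions.
from django.db import migrations

from apps.core.models.legacy_versions import migrate_dataset_versions


def forwards(apps, schema_editor):
    # Pass the historical app registry so migration-state models are used.
    migrate_dataset_versions(apps=apps)


class Migration(migrations.Migration):
    dependencies = [
        ("core", "0001_initial"),  # placeholder; use the actual previous migration
    ]

    operations = [migrations.RunPython(forwards, migrations.RunPython.noop)]
```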
Source code in src/apps/core/models/legacy_versions.py
```python
def migrate_dataset_versions(apps=django_apps):
    """Assign all datasets to DatasetVersions based on legacy data.

    Intended for use in a data migration. Does the following:

    - Assign datasets to the same DatasetVersions as other datasets in dataset_json.dataset_version_set
    - Add the id of each dataset in dataset_version_set to DatasetVersions.legacy_versions
    """
    versions_model = apps.get_model("core", "DatasetVersions")
    legacy_dataset_model = apps.get_model("core", "LegacyDataset")
    dataset_model = apps.get_model("core", "Dataset")

    # Collect all datasets and version sets by their identifiers
    datasets_by_id = dataset_model.objects.only("id", "dataset_versions_id").in_bulk()
    version_sets_by_id = versions_model.objects.in_bulk()

    # Get all legacy dataset_version_set lists
    legacy_versions_data = legacy_dataset_model.objects.filter(
        dataset_json__dataset_version_set__isnull=False
    ).values(
        identifier=models.F("dataset_json__identifier"),
        next_draft=models.F("dataset_json__next_draft__identifier"),
        draft_of=models.F("dataset_json__draft_of__identifier"),
        version_set=models.F("dataset_json__dataset_version_set"),
    )

    # Collect identifiers, merge all sets containing at least one common dataset
    legacy_version_data_ids = []
    for version_data in legacy_versions_data:
        ids = [UUID(version["identifier"]) for version in version_data["version_set"]]
        # A draft dataset isn't listed in its own version_set in V2,
        # so we add the related identifiers manually just in case
        if identifier := version_data["identifier"]:
            ids.append(UUID(identifier))
        if next_draft := version_data["next_draft"]:
            ids.append(UUID(next_draft))
        if draft_of := version_data["draft_of"]:
            ids.append(UUID(draft_of))
        legacy_version_data_ids.append(ids)
    legacy_version_sets = merge_sets(legacy_version_data_ids)

    # Assign each merged set of datasets to a shared DatasetVersions instance
    dataset_updates = []
    new_version_sets = []
    for version_set in legacy_version_sets:
        _assign_dataset_version(
            version_set=version_set,
            datasets_by_id=datasets_by_id,
            version_sets_by_id=version_sets_by_id,
            new_dataset_versions=new_version_sets,
            dataset_updates=dataset_updates,
            versions_model=versions_model,
        )

    # Persist new version sets, updated legacy_versions lists and dataset assignments
    versions_model.objects.bulk_create(new_version_sets)
    versions_model.objects.bulk_update(version_sets_by_id.values(), fields=["legacy_versions"])
    dataset_model.objects.bulk_update(dataset_updates, fields=["dataset_versions_id"])

    # Remove DatasetVersions that no longer contain any datasets
    versions_model.objects.filter(datasets__isnull=True).delete()
```
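The code above relies on a module-level helper, `merge_sets`, whose implementation is not shown in this excerpt. Judging by the surrounding comment, it merges identifier lists that share at least one dataset into single sets. The following is a minimal sketch of how such a merge could be done with union-find; it is an assumption about the helper's behaviour, not the project's actual code.

```python
# Illustrative sketch of a set-merging helper: lists that share at least one
# identifier end up in the same output set (connected components).
# This is an assumed implementation, not the one in legacy_versions.py.
from uuid import UUID


def merge_sets(id_lists: list[list[UUID]]) -> list[set[UUID]]:
    parent: dict[UUID, UUID] = {}

    def find(x: UUID) -> UUID:
        # Follow parent links to the root, compressing the path as we go.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a: UUID, b: UUID) -> None:
        root_a, root_b = find(a), find(b)
        if root_a != root_b:
            parent[root_a] = root_b

    for ids in id_lists:
        for identifier in ids:
            parent.setdefault(identifier, identifier)
        for identifier in ids[1:]:
            union(ids[0], identifier)

    # Group identifiers by their root to produce the merged sets.
    merged: dict[UUID, set[UUID]] = {}
    for identifier in parent:
        merged.setdefault(find(identifier), set()).add(identifier)
    return list(merged.values())
```

For example, merging `[[a, b], [b, c], [d]]` would yield `{a, b, c}` and `{d}`, so all versions of one dataset family end up in a single group.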
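`_assign_dataset_version` is likewise defined elsewhere in the module. Purely for orientation, the hypothetical sketch below shows what a helper with that call signature might do, inferred from the call site and the bulk operations that follow the loop; the real implementation is not part of this excerpt and may differ.

```python
# Hypothetical sketch of a helper matching the _assign_dataset_version call
# signature above. Inferred from its call site only; not the project's code.
import uuid


def _assign_dataset_version(
    version_set,
    datasets_by_id,
    version_sets_by_id,
    new_dataset_versions,
    dataset_updates,
    versions_model,
):
    # Reuse an existing DatasetVersions if any dataset in the set already has one.
    versions = None
    for dataset_id in version_set:
        dataset = datasets_by_id.get(dataset_id)
        if dataset and dataset.dataset_versions_id:
            versions = version_sets_by_id[dataset.dataset_versions_id]
            break

    if versions is None:
        # Otherwise create a new instance; it is saved later with bulk_create.
        versions = versions_model(id=uuid.uuid4())
        new_dataset_versions.append(versions)

    # Record all legacy identifiers on the version set (assumed list-like field).
    versions.legacy_versions = list(version_set)

    # Point every known dataset in the set at the shared DatasetVersions.
    for dataset_id in version_set:
        dataset = datasets_by_id.get(dataset_id)
        if dataset and dataset.dataset_versions_id != versions.id:
            dataset.dataset_versions_id = versions.id
            dataset_updates.append(dataset)
```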