Skip to content

legacy#

IncompatibleAPIVersion#

Bases: APIException

Source code in src/apps/core/models/legacy.py
class IncompatibleAPIVersion(exceptions.APIException):
    """Raised when a dataset is not compatible with the requested API version.

    E.g. update_from_legacy raises this when the dataset has already been
    modified with API version 3 or later.
    """

    status_code = 400  # Report as a client error instead of APIException's default.

add_escapes#

Source code in src/apps/core/models/legacy.py
def add_escapes(val: str):
    """Return *val* with the literal characters ``[`` and ``]`` backslash-escaped."""
    escape_table = str.maketrans({"[": "\\[", "]": "\\]"})
    return val.translate(escape_table)

regex#

Escape [ and ] and compile into regex.

Source code in src/apps/core/models/legacy.py
def regex(path: str):
    """Escape [ and ] and compile into regex."""
    escaped_path = add_escapes(path)
    return re.compile(escaped_path)

LegacyDataset#

Bases: Dataset

Migrated V1 and V2 Datasets

Stores legacy dataset json fields and derives v3 dataset fields from them when update_from_legacy is called.

Attributes:

Name Type Description
dataset_json JSONField

V1/V2 dataset json from legacy metax dataset API

contract_json JSONField

Contract json under which the dataset falls

files_json JSONField

Files attached to the dataset through the dataset/files API in v2

v2_dataset_compatibility_diff JSONField

Difference between V1-V2 and V3 dataset json

Source code in src/apps/core/models/legacy.py
class LegacyDataset(Dataset):
    """Migrated V1 and V2 Datasets

    Stores legacy dataset json fields and derives v3 dataset
    fields from them when update_from_legacy is called.

    Attributes:
        dataset_json (models.JSONField): V1/V2 dataset json from legacy metax dataset API
        contract_json (models.JSONField): Contract json under which the dataset falls
        files_json (models.JSONField): Files attached to the dataset through dataset/files API in v2
        v2_dataset_compatibility_diff (models.JSONField):
            Difference between v1-v2 and V3 dataset json
    """

    # Multi-table inheritance parent link: LegacyDataset shares its primary
    # key with the underlying Dataset row.
    dataset = models.OneToOneField(
        Dataset,
        on_delete=models.CASCADE,
        parent_link=True,
        primary_key=True,
    )
    dataset_json = models.JSONField(encoder=DjangoJSONEncoder)
    contract_json = models.JSONField(null=True, blank=True, encoder=DjangoJSONEncoder)
    files_json = models.JSONField(null=True, blank=True, encoder=DjangoJSONEncoder)
    v2_dataset_compatibility_diff = models.JSONField(
        null=True,
        blank=True,
        encoder=DjangoJSONEncoder,
        help_text="Difference between v1-v2 and V3 dataset json",
    )
    # Error details from the latest migration attempt (see update_from_legacy);
    # cleared (None) after a fully successful migration.
    migration_errors = models.JSONField(null=True, blank=True, encoder=DjangoJSONEncoder)
    last_successful_migration = models.DateTimeField(null=True, blank=True)
    # Legacy values the converter reported as invalid, keyed by json path.
    invalid_legacy_values = models.JSONField(null=True, blank=True, encoder=DjangoJSONEncoder)
    # Legacy values the converter fixed automatically, keyed by json path.
    fixed_legacy_values = models.JSONField(null=True, blank=True, encoder=DjangoJSONEncoder)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Tally of related objects created while processing this dataset,
        # e.g. {"File": 3, "User": 1}.
        self.created_objects = Counter()

        # If api_version was not provided, replace the field default with value from dataset_json
        if not is_field_value_provided(self.__class__, "api_version", args, kwargs):
            self.api_version = self.dataset_json.get("api_meta", {}).get("version", 1)

        # Get minimal dataset fields from legacy json
        if not is_field_value_provided(self.__class__, "metadata_owner_id", args, kwargs):
            self.attach_metadata_owner()
        if "title" not in kwargs:
            self.title = self.legacy_research_dataset["title"]

    @property
    def is_legacy(self):
        # Always True for LegacyDataset; presumably overrides a Dataset
        # property that reports False for native V3 datasets.
        return True

    @property
    def legacy_identifier(self):
        """Legacy database primary key"""
        return self.dataset_json.get("identifier")

    @property
    def legacy_persistent_identifier(self):
        """Resolvable persistent identifier like DOI or URN"""
        return self.legacy_research_dataset.get("preferred_identifier")

    @property
    def metadata_provider_user(self):
        """Username of the metadata provider from the legacy dataset json."""
        return self.dataset_json.get("metadata_provider_user")

    @property
    def metadata_provider_org(self):
        """Provider organization, falling back to metadata_owner_org when missing."""
        if org := self.dataset_json.get("metadata_provider_org"):
            return org
        else:
            return self.dataset_json.get("metadata_owner_org")

    @property
    def legacy_research_dataset(self):
        """The research_dataset dict from the legacy json (empty dict when absent)."""
        return ensure_dict(self.dataset_json.get("research_dataset") or {})

    @property
    def legacy_data_catalog(self):
        """Data catalog value from the legacy dataset json."""
        return self.dataset_json.get("data_catalog")

    @property
    def legacy_contract(self):
        # contract_json wraps the actual contract document under the
        # "contract_json" key; returns None when no contract is attached.
        if self.contract_json:
            return self.contract_json["contract_json"]

    def create_snapshot(self, **kwargs):
        """Create snapshot of dataset.

        Due to how simple-history works, LegacyDataset will need to be "cast"
        into a normal Dataset for creating a snapshot.
        """
        self.dataset.create_snapshot(**kwargs)

    def convert_checksum_v2_to_v3(self, checksum: dict, value_key="value") -> Optional[str]:
        """Convert a V2 checksum dict into the V3 "algorithm:value" string.

        The algorithm is lowercased with dashes removed (e.g. "SHA-256" ->
        "sha256") and the value is lowercased. Returns None for a falsy
        checksum.
        """
        if not checksum:
            return None
        ensure_dict(checksum)  # reject non-dict checksum input
        algorithm = checksum.get("algorithm", "").lower().replace("-", "")
        value = checksum.get(value_key, "").lower()
        return f"{algorithm}:{value}"

    def validate_identifiers(self):
        """Ensure the legacy identifier exists and is a valid UUID.

        Raises:
            serializers.ValidationError: If the identifier is missing or not a UUID.
        """
        if not self.legacy_identifier:
            raise serializers.ValidationError(
                {"dataset_json__identifier": _("Value is required.")}
            )
        try:
            uuid.UUID(self.legacy_identifier)
        except ValueError:
            raise serializers.ValidationError(
                {"dataset_json__identifier": _("Value is not a valid UUID.")}
            )

    def attach_metadata_owner(self) -> Actor:
        """Creates new MetadataProvider object from metadata-owner field, that is usually CSC-username"""
        # NOTE(review): annotated as returning Actor but actually returns the
        # MetadataProvider instance — confirm intended return type.
        metadata_user, user_created = MetaxUser.objects.get_or_create(
            username=self.metadata_provider_user
        )
        owner_created = False
        # filter().first() + create() instead of get_or_create() tolerates
        # duplicate (user, organization) rows already in the database.
        metadata_owner = MetadataProvider.objects.filter(
            user=metadata_user, organization=self.metadata_provider_org
        ).first()
        if not metadata_owner:
            metadata_owner = MetadataProvider.objects.create(
                user=metadata_user, organization=self.metadata_provider_org
            )
            owner_created = True
        if owner_created:
            self.created_objects.update(["MetadataProvider"])
        if user_created:
            self.created_objects.update(["User"])
        self.metadata_owner = metadata_owner
        return metadata_owner

    def attach_contract(self) -> Contract:
        """Get or create a Contract from the legacy contract json and attach it.

        Returns the Contract, or None when the dataset has no legacy contract.
        """
        if self.legacy_contract:
            ensure_dict(self.legacy_contract)
            contract, created = Contract.objects.get_or_create(
                quota=self.legacy_contract["quota"],
                valid_from=self.legacy_contract["validity"]["start_date"],
                description=self.legacy_contract["description"],
                title={"fi": self.legacy_contract["title"]},
                url=self.legacy_contract["identifier"],
            )
            if created:
                self.created_objects.update(["Contract"])
            self.contract = contract
            return contract

    def attach_files(self):
        """Create File objects from files_json and group them into FileSets.

        Returns the FileSet created/fetched for the last storage processed,
        or None when there are no files.
        """
        # Files grouped by storage primary key: {storage_id: [File, ...]}
        storage_file_objects = {}
        if files := self.files_json:
            ensure_list(files)
            for f in files:
                file_id = f["identifier"]
                checksum = f.get("checksum", {})

                # Map the legacy file storage identifier to a V3 storage service.
                storage_service = settings.LEGACY_FILE_STORAGE_TO_V3_STORAGE_SERVICE[
                    f["file_storage"]["identifier"]
                ]
                storage = get_or_create_storage(
                    csc_project=f["project_identifier"],
                    storage_service=storage_service,
                )
                # Existing files are matched by storage_identifier only; the
                # defaults are applied just when the File is first created.
                new_file, created = File.objects.get_or_create(
                    storage_identifier=file_id,
                    defaults={
                        "checksum": self.convert_checksum_v2_to_v3(checksum),
                        "size": f["byte_size"],
                        "pathname": f["file_path"],
                        "modified": f["file_modified"],
                        "storage_identifier": f["identifier"],
                        "storage": storage,
                    },
                )
                if created:
                    self.created_objects.update(["File"])

                storage_file_objects.setdefault(storage.id, []).append(new_file)

        file_set = None
        for storage_id, file_objects in storage_file_objects.items():
            file_set, created = FileSet.objects.get_or_create(dataset=self, storage_id=storage_id)
            if created:
                self.created_objects.update(["FileSet"])
            file_set.files.set(file_objects)
        return file_set

    def update_from_legacy(self, context=None, raise_serializer_errors=True):
        """Update dataset fields from legacy data dictionaries."""
        if self._state.adding:
            raise ValueError("LegacyDataset needs to be saved before using update_from_legacy.")

        if not context:
            context = {}

        if self.api_version >= 3:
            raise IncompatibleAPIVersion(
                detail="Dataset has been modified with a later API version."
            )

        # Imported here, presumably to avoid a circular import at module load.
        from apps.core.serializers.legacy_serializer import LegacyDatasetUpdateSerializer

        updated = False
        try:
            with transaction.atomic():  # Undo update if e.g. serialization fails
                self.attach_metadata_owner()
                converter = LegacyDatasetConverter(
                    dataset_json=self.dataset_json, convert_only=False
                )
                data = converter.convert_dataset()
                self.created_objects.update(converter.created_objects)
                self.invalid_legacy_values = converter.get_invalid_values_by_path()
                self.fixed_legacy_values = converter.get_fixed_values_by_path()
                serializer = LegacyDatasetUpdateSerializer(
                    instance=self,
                    data=data,
                    context={**context, "dataset": self, "migrating": True},
                )
                serializer.is_valid(raise_exception=True)
                serializer.save()
                self.attach_files()
                self.attach_contract()
                updated = True
        except serializers.ValidationError as error:
            # Save error details to migration_errors
            detail = error.detail
            if not isinstance(error.detail, list):
                detail = [detail]
            # Round-trip through json so the stored details are plain data.
            detail = json.loads(json.dumps(detail))
            self.migration_errors = {"serializer_errors": detail}
            self.save()
            if raise_serializer_errors:
                raise
        if updated:
            from apps.core.models.legacy_compatibility import LegacyCompatibility

            # Record how the migrated V3 dataset differs from the V2 original;
            # a diff with migration errors keeps the dataset marked as failed.
            compat = LegacyCompatibility(self)
            diff = compat.get_compatibility_diff()
            self.v2_dataset_compatibility_diff = diff
            if migration_errors := compat.get_migration_errors_from_diff(diff):
                self.migration_errors = migration_errors
            else:
                self.migration_errors = None
                self.last_successful_migration = timezone.now()
        self.save()
        return self

    def save(self, *args, **kwargs):
        """Validate legacy identifiers before delegating to Dataset.save.

        Raises:
            serializers.ValidationError: If id does not match the legacy
                identifier, the identifier is invalid, or a non-legacy dataset
                already exists with the same id.
        """
        if str(self.id) != str(self.legacy_identifier):
            raise serializers.ValidationError({"id": _("Value does not match V2 identifier.")})

        self.validate_identifiers()

        # Refuse to silently convert a native V3 dataset into a legacy one.
        if Dataset.objects.filter(id=self.id, legacydataset__isnull=True).exists():
            raise serializers.ValidationError(
                {"id": _("A non-legacy dataset already exists with the same identifier.")}
            )

        self.saving_legacy = True  # Enable less strict validation in Dataset.save
        return super().save(*args, **kwargs)

legacy_identifier property #

Legacy database primary key

legacy_persistent_identifier property #

Resolvable persistent identifier like DOI or URN

attach_metadata_owner() #

Creates a new MetadataProvider object from the metadata-owner field, which is usually a CSC username

Source code in src/apps/core/models/legacy.py
def attach_metadata_owner(self) -> Actor:
    """Creates new MetadataProvider object from metadata-owner field, that is usually CSC-username"""
    # NOTE(review): annotated as returning Actor but actually returns the
    # MetadataProvider instance — confirm intended return type.
    metadata_user, user_created = MetaxUser.objects.get_or_create(
        username=self.metadata_provider_user
    )
    owner_created = False
    # filter().first() + create() instead of get_or_create() tolerates
    # duplicate (user, organization) rows already in the database.
    metadata_owner = MetadataProvider.objects.filter(
        user=metadata_user, organization=self.metadata_provider_org
    ).first()
    if not metadata_owner:
        metadata_owner = MetadataProvider.objects.create(
            user=metadata_user, organization=self.metadata_provider_org
        )
        owner_created = True
    # Count created objects so callers can report what the migration produced.
    if owner_created:
        self.created_objects.update(["MetadataProvider"])
    if user_created:
        self.created_objects.update(["User"])
    self.metadata_owner = metadata_owner
    return metadata_owner

create_snapshot(**kwargs) #

Create snapshot of dataset.

Due to how simple-history works, LegacyDataset will need to be "cast" into a normal Dataset for creating a snapshot.

Source code in src/apps/core/models/legacy.py
def create_snapshot(self, **kwargs):
    """Create snapshot of dataset.

    Due to how simple-history works, LegacyDataset will need to be "cast"
    into a normal Dataset for creating a snapshot.
    """
    # Delegate to the parent-link Dataset instance rather than self.
    self.dataset.create_snapshot(**kwargs)

update_from_legacy(context=None, raise_serializer_errors=True) #

Update dataset fields from legacy data dictionaries.

Source code in src/apps/core/models/legacy.py
def update_from_legacy(self, context=None, raise_serializer_errors=True):
    """Update dataset fields from legacy data dictionaries.

    Args:
        context: Extra context merged into the update serializer's context.
        raise_serializer_errors: When False, validation errors are only
            stored in migration_errors instead of being re-raised.

    Raises:
        ValueError: If called on an unsaved instance.
        IncompatibleAPIVersion: If the dataset was modified with API v3+.
        serializers.ValidationError: On serializer validation failure,
            unless raise_serializer_errors is False.
    """
    if self._state.adding:
        raise ValueError("LegacyDataset needs to be saved before using update_from_legacy.")

    if not context:
        context = {}

    if self.api_version >= 3:
        raise IncompatibleAPIVersion(
            detail="Dataset has been modified with a later API version."
        )

    # Imported here, presumably to avoid a circular import at module load.
    from apps.core.serializers.legacy_serializer import LegacyDatasetUpdateSerializer

    updated = False
    try:
        with transaction.atomic():  # Undo update if e.g. serialization fails
            self.attach_metadata_owner()
            converter = LegacyDatasetConverter(
                dataset_json=self.dataset_json, convert_only=False
            )
            data = converter.convert_dataset()
            self.created_objects.update(converter.created_objects)
            self.invalid_legacy_values = converter.get_invalid_values_by_path()
            self.fixed_legacy_values = converter.get_fixed_values_by_path()
            serializer = LegacyDatasetUpdateSerializer(
                instance=self,
                data=data,
                context={**context, "dataset": self, "migrating": True},
            )
            serializer.is_valid(raise_exception=True)
            serializer.save()
            self.attach_files()
            self.attach_contract()
            updated = True
    except serializers.ValidationError as error:
        # Save error details to migration_errors
        detail = error.detail
        if not isinstance(error.detail, list):
            detail = [detail]
        # Round-trip through json so the stored details are plain data.
        detail = json.loads(json.dumps(detail))
        self.migration_errors = {"serializer_errors": detail}
        self.save()
        if raise_serializer_errors:
            raise
    if updated:
        from apps.core.models.legacy_compatibility import LegacyCompatibility

        # Record how the migrated V3 dataset differs from the V2 original;
        # a diff with migration errors keeps the dataset marked as failed.
        compat = LegacyCompatibility(self)
        diff = compat.get_compatibility_diff()
        self.v2_dataset_compatibility_diff = diff
        if migration_errors := compat.get_migration_errors_from_diff(diff):
            self.migration_errors = migration_errors
        else:
            self.migration_errors = None
            self.last_successful_migration = timezone.now()
    self.save()
    return self