"""Dataverse data-types data model."""

import json
import os

from pyDataverse.utils import validate_data


INTERNAL_ATTRIBUTES = [
    "_default_json_format",
    "_default_json_schema_filename",
    "_allowed_json_formats",
    "_json_dataverse_upload_attr",
    "_internal_attributes",
]


class DVObject:
    """Base class for the Dataverse data types `Dataverse`, `Dataset` and `Datafile`."""

    def __init__(self, data=None):
        """Init :class:`DVObject`.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        """
        if data is not None:
            self.set(data)

    def set(self, data):
        """Set class attributes from a flat dictionary.

        The flat dict is the main way to set the class attributes.
        It is the main interface between the object and the outside world.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        """
        assert isinstance(data, dict)

        for key, val in data.items():
            if key in self._internal_attributes:
                print("Importing attribute {0} not allowed.".format(key))
            else:
                self.__setattr__(key, val)

    def get(self):
        """Create a flat `dict` of all attributes.

        Creates a :class:`dict` with all attributes in a flat structure.
        The flat :class:`dict` can then be used for further processing.

        Returns
        -------
        dict
            Data in a flat data structure.

        """
        data = {}

        for attr in list(self.__dict__.keys()):
            if attr not in INTERNAL_ATTRIBUTES:
                data[attr] = self.__getattribute__(attr)

        assert isinstance(data, dict)
        return data
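
    # Usage sketch (illustrative, not from the original source): `set()` maps a
    # flat dict onto attributes and `get()` reads them back. Run it on a
    # subclass, since only subclasses define `_internal_attributes`; the
    # `alias`/`name` values are made up.
    #
    #   >>> from pyDataverse.models import Dataverse
    #   >>> dv = Dataverse()
    #   >>> dv.set({"alias": "pyDataverse", "name": "pyDataverse Collection"})
    #   >>> dv.get()
    #   {'alias': 'pyDataverse', 'name': 'pyDataverse Collection'}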

    def validate_json(self, filename_schema=None):
        """Validate JSON formats.

        Check if the JSON data structure is valid.

        Parameters
        ----------
        filename_schema : str
            Filename of the JSON schema, with full path.

        Returns
        -------
        bool
            `True` if the JSON validates correctly, `False` if not.

        """
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        return validate_data(
            json.loads(self.json(validate=False)),
            filename_schema,
            file_format="json",
        )

    def from_json(
        self, json_str, data_format=None, validate=True, filename_schema=None
    ):
        """Import metadata from a JSON string.

        Parses in the metadata from different JSON formats.

        Parameters
        ----------
        json_str : str
            JSON string to be imported.
        data_format : str
            Data formats available for import. See `_allowed_json_formats`.
        validate : bool
            `True` if the imported JSON should be validated against a JSON
            schema file, `False` if the JSON string should be imported
            directly without validation.
        filename_schema : str
            Filename of the JSON schema, with full path.

        """
        assert isinstance(json_str, str)
        json_dict = json.loads(json_str)
        assert isinstance(json_dict, dict)
        assert isinstance(validate, bool)

        if data_format is None:
            data_format = self._default_json_format
        assert isinstance(data_format, str)
        assert data_format in self._allowed_json_formats

        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        data = {}

        if data_format == "dataverse_upload":
            if validate:
                validate_data(json_dict, filename_schema)
            # get first level metadata and parse it automatically
            for key in json_dict.keys():
                if key in self._json_dataverse_upload_attr:
                    data[key] = json_dict[key]
                else:
                    print(
                        "INFO: Attribute {0} not valid for import (data format=`{1}`).".format(
                            key, data_format
                        )
                    )
        elif data_format == "dataverse_download":
            print("INFO: Not implemented yet.")
        elif data_format == "dspace":
            print("INFO: Not implemented yet.")
        elif data_format == "custom":
            print("INFO: Not implemented yet.")
        else:
            # TODO: add exception for wrong data format
            pass

        self.set(data)

    def json(self, data_format=None, validate=True, filename_schema=None):
        r"""Create JSON from :class:`DVObject` attributes.

        Parameters
        ----------
        data_format : str
            Data format of the JSON output. See `_allowed_json_formats`.
        validate : bool
            `True` if the created JSON should be validated against a JSON
            schema file, `False` if the JSON string should be created
            without validation.
        filename_schema : str
            Filename of the JSON schema, with full path.

        Returns
        -------
        str
            The data as a JSON string.

        """
        assert isinstance(validate, bool)
        if data_format is None:
            data_format = self._default_json_format
        assert isinstance(data_format, str)
        assert data_format in self._allowed_json_formats
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        data = {}

        if data_format == "dataverse_upload":
            for attr in self._json_dataverse_upload_attr:
                # check if attribute exists
                if hasattr(self, attr):
                    data[attr] = self.__getattribute__(attr)
        elif data_format == "dspace":
            print("INFO: Not implemented yet.")
            return False
        elif data_format == "custom":
            print("INFO: Not implemented yet.")
            return False

        if validate:
            validate_data(data, filename_schema)

        json_str = json.dumps(data, indent=2)
        assert isinstance(json_str, str)
        return json_str
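
# Round-trip sketch (illustrative, not from the original source): export a
# subclass instance to upload JSON with `json()` and read it back with
# `from_json()`; `validate=False` skips the bundled-schema check.
#
#   >>> from pyDataverse.models import Dataverse
#   >>> dv = Dataverse({"alias": "science", "name": "Science Collection"})
#   >>> json_str = dv.json(validate=False)
#   >>> dv2 = Dataverse()
#   >>> dv2.from_json(json_str, validate=False)
#   >>> dv2.alias
#   'science'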


class Dataverse(DVObject):
    """Base class for the Dataverse data type `Dataverse`.

    Attributes
    ----------
    _default_json_format : str
        Default JSON data format.
    _default_json_schema_filename : str
        Default JSON schema filename.
    _allowed_json_formats : list
        List of all possible JSON data formats.
    _json_dataverse_upload_attr : list
        List of all attributes to be exported in :func:`json`.

    """

    def __init__(self, data=None):
        """Init :class:`Dataverse()`.

        Inherits attributes from parent :class:`DVObject()`.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        Examples
        --------
        Create a Dataverse::

            >>> from pyDataverse.models import Dataverse
            >>> dv = Dataverse()
            >>> dv._default_json_schema_filename
            'schemas/json/dataverse_upload_schema.json'

        """
        self._internal_attributes = [
            "_Dataverse" + attr for attr in INTERNAL_ATTRIBUTES
        ]

        super().__init__(data=data)

        self._default_json_format = "dataverse_upload"
        self._default_json_schema_filename = "schemas/json/dataverse_upload_schema.json"
        self._allowed_json_formats = ["dataverse_upload", "dataverse_download"]
        self._json_dataverse_upload_attr = [
            "affiliation",
            "alias",
            "dataverseContacts",
            "dataverseType",
            "description",
            "name",
        ]


class Dataset(DVObject):
    """Base class for the Dataverse data type `Dataset`.

    Attributes
    ----------
    _default_json_format : str
        Default JSON data format.
    _default_json_schema_filename : str
        Default JSON schema filename.
    _allowed_json_formats : list
        List of all possible JSON data formats.
    _json_dataverse_upload_attr : list
        List with all attributes to be exported in :func:`json`.
    __attr_import_dv_up_datasetVersion_values : list
        Dataverse API Upload Dataset JSON attributes inside
        ds['datasetVersion'].
    __attr_import_dv_up_citation_fields_values : list
        Dataverse API Upload Dataset JSON attributes inside
        ds['datasetVersion']['metadataBlocks']['citation']['fields'].
    __attr_import_dv_up_citation_fields_arrays : dict
        Dataverse API Upload Dataset JSON attributes inside
        ['datasetVersion']['metadataBlocks']['citation']['fields'].
    __attr_import_dv_up_geospatial_fields_values : list
        Attributes of the Dataverse API Upload Dataset JSON metadata standard
        inside ['datasetVersion']['metadataBlocks']['geospatial']['fields'].
    __attr_import_dv_up_geospatial_fields_arrays : dict
        Attributes of the Dataverse API Upload Dataset JSON metadata standard
        inside ['datasetVersion']['metadataBlocks']['geospatial']['fields'].
    __attr_import_dv_up_socialscience_fields_values : list
        Attributes of the Dataverse API Upload Dataset JSON metadata standard
        inside ['datasetVersion']['metadataBlocks']['socialscience']['fields'].
    __attr_import_dv_up_journal_fields_values : list
        Attributes of the Dataverse API Upload Dataset JSON metadata standard
        inside ['datasetVersion']['metadataBlocks']['journal']['fields'].
    __attr_import_dv_up_journal_fields_arrays : dict
        Attributes of the Dataverse API Upload Dataset JSON metadata standard
        inside ['datasetVersion']['metadataBlocks']['journal']['fields'].
    __attr_dict_dv_up_required : list
        Required attributes for valid `dv_up` metadata dict creation.
    __attr_dict_dv_up_type_class_primitive : list
        typeClass primitive.
    __attr_dict_dv_up_type_class_compound : list
        typeClass compound.
    __attr_dict_dv_up_type_class_controlled_vocabulary : list
        typeClass controlledVocabulary.
    __attr_dict_dv_up_single_dict : list
        These attributes are excluded from automatic parsing in ds.get().
    __attr_displayNames : list
        Attributes of displayName.

    """

    __attr_import_dv_up_datasetVersion_values = [
        "license",
        "termsOfAccess",
        "fileAccessRequest",
        "protocol",
        "authority",
        "identifier",
        "termsOfUse",
    ]
    __attr_import_dv_up_citation_fields_values = [
        "accessToSources",
        "alternativeTitle",
        "alternativeURL",
        "characteristicOfSources",
        "dateOfDeposit",
        "dataSources",
        "depositor",
        "distributionDate",
        "kindOfData",
        "language",
        "notesText",
        "originOfSources",
        "otherReferences",
        "productionDate",
        "productionPlace",
        "relatedDatasets",
        "relatedMaterial",
        "subject",
        "subtitle",
        "title",
    ]
    __attr_import_dv_up_citation_fields_arrays = {
        "author": [
            "authorName",
            "authorAffiliation",
            "authorIdentifierScheme",
            "authorIdentifier",
        ],
        "contributor": ["contributorType", "contributorName"],
        "dateOfCollection": ["dateOfCollectionStart", "dateOfCollectionEnd"],
        "datasetContact": [
            "datasetContactName",
            "datasetContactAffiliation",
            "datasetContactEmail",
        ],
        "distributor": [
            "distributorName",
            "distributorAffiliation",
            "distributorAbbreviation",
            "distributorURL",
            "distributorLogoURL",
        ],
        "dsDescription": ["dsDescriptionValue", "dsDescriptionDate"],
        "grantNumber": ["grantNumberAgency", "grantNumberValue"],
        "keyword": ["keywordValue", "keywordVocabulary", "keywordVocabularyURI"],
        "producer": [
            "producerName",
            "producerAffiliation",
            "producerAbbreviation",
            "producerURL",
            "producerLogoURL",
        ],
        "otherId": ["otherIdAgency", "otherIdValue"],
        "publication": [
            "publicationCitation",
            "publicationIDType",
            "publicationIDNumber",
            "publicationURL",
        ],
        "software": ["softwareName", "softwareVersion"],
        "timePeriodCovered": ["timePeriodCoveredStart", "timePeriodCoveredEnd"],
        "topicClassification": [
            "topicClassValue",
            "topicClassVocab",
            "topicClassVocabURI",
        ],
    }
    __attr_import_dv_up_geospatial_fields_values = ["geographicUnit"]
    __attr_import_dv_up_geospatial_fields_arrays = {
        "geographicBoundingBox": [
            "westLongitude",
            "eastLongitude",
            "northLongitude",
            "southLongitude",
        ],
        "geographicCoverage": ["country", "state", "city", "otherGeographicCoverage"],
    }
    __attr_import_dv_up_socialscience_fields_values = [
        "actionsToMinimizeLoss",
        "cleaningOperations",
        "collectionMode",
        "collectorTraining",
        "controlOperations",
        "dataCollectionSituation",
        "dataCollector",
        "datasetLevelErrorNotes",
        "deviationsFromSampleDesign",
        "frequencyOfDataCollection",
        "otherDataAppraisal",
        "researchInstrument",
        "responseRate",
        "samplingErrorEstimates",
        "samplingProcedure",
        "unitOfAnalysis",
        "universe",
        "timeMethod",
        "weighting",
    ]
    __attr_import_dv_up_journal_fields_values = ["journalArticleType"]
    __attr_import_dv_up_journal_fields_arrays = {
        "journalVolumeIssue": ["journalVolume", "journalIssue", "journalPubDate"]
    }
    __attr_dict_dv_up_required = [
        "author",
        "datasetContact",
        "dsDescription",
        "subject",
        "title",
    ]
    __attr_dict_dv_up_type_class_primitive = (
        [
            "accessToSources",
            "alternativeTitle",
            "alternativeURL",
            "authorAffiliation",
            "authorIdentifier",
            "authorName",
            "characteristicOfSources",
            "city",
            "contributorName",
            "dateOfDeposit",
            "dataSources",
            "depositor",
            "distributionDate",
            "kindOfData",
            "notesText",
            "originOfSources",
            "otherGeographicCoverage",
            "otherReferences",
            "productionDate",
            "productionPlace",
            "publicationCitation",
            "publicationIDNumber",
            "publicationURL",
            "relatedDatasets",
            "relatedMaterial",
            "seriesInformation",
            "seriesName",
            "state",
            "subtitle",
            "title",
        ]
        + __attr_import_dv_up_citation_fields_arrays["dateOfCollection"]
        + __attr_import_dv_up_citation_fields_arrays["datasetContact"]
        + __attr_import_dv_up_citation_fields_arrays["distributor"]
        + __attr_import_dv_up_citation_fields_arrays["dsDescription"]
        + __attr_import_dv_up_citation_fields_arrays["grantNumber"]
        + __attr_import_dv_up_citation_fields_arrays["keyword"]
        + __attr_import_dv_up_citation_fields_arrays["producer"]
        + __attr_import_dv_up_citation_fields_arrays["otherId"]
        + __attr_import_dv_up_citation_fields_arrays["software"]
        + __attr_import_dv_up_citation_fields_arrays["timePeriodCovered"]
        + __attr_import_dv_up_citation_fields_arrays["topicClassification"]
        + __attr_import_dv_up_geospatial_fields_values
        + __attr_import_dv_up_geospatial_fields_arrays["geographicBoundingBox"]
        + __attr_import_dv_up_socialscience_fields_values
        + __attr_import_dv_up_journal_fields_arrays["journalVolumeIssue"]
        + [
            "socialScienceNotesType",
            "socialScienceNotesSubject",
            "socialScienceNotesText",
        ]
        + ["targetSampleActualSize", "targetSampleSizeFormula"]
    )
    __attr_dict_dv_up_type_class_compound = (
        list(__attr_import_dv_up_citation_fields_arrays.keys())
        + list(__attr_import_dv_up_geospatial_fields_arrays.keys())
        + list(__attr_import_dv_up_journal_fields_arrays.keys())
        + ["series", "socialScienceNotes", "targetSampleSize"]
    )
    __attr_dict_dv_up_type_class_controlled_vocabulary = [
        "authorIdentifierScheme",
        "contributorType",
        "country",
        "journalArticleType",
        "language",
        "publicationIDType",
        "subject",
    ]
    __attr_dict_dv_up_single_dict = ["series", "socialScienceNotes", "targetSampleSize"]
    __attr_displayNames = [
        "citation_displayName",
        "geospatial_displayName",
        "socialscience_displayName",
        "journal_displayName",
    ]

    def __init__(self, data=None):
        """Init a :class:`Dataset()` class.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        Examples
        --------
        Create a Dataset::

            >>> from pyDataverse.models import Dataset
            >>> ds = Dataset()
            >>> ds._default_json_schema_filename
            'schemas/json/dataset_upload_default_schema.json'

        """
        self._internal_attributes = ["_Dataset" + attr for attr in INTERNAL_ATTRIBUTES]

        super().__init__(data=data)

        self._default_json_format = "dataverse_upload"
        self._default_json_schema_filename = (
            "schemas/json/dataset_upload_default_schema.json"
        )
        self._allowed_json_formats = [
            "dataverse_upload",
            "dataverse_download",
            "dspace",
            "custom",
        ]
        self._json_dataverse_upload_attr = [
            "license",
            "termsOfUse",
            "termsOfAccess",
            "fileAccessRequest",
            "protocol",
            "authority",
            "identifier",
            "citation_displayName",
            "title",
            "subtitle",
            "alternativeTitle",
            "alternativeURL",
            "otherId",
            "author",
            "datasetContact",
            "dsDescription",
            "subject",
            "keyword",
            "topicClassification",
            "publication",
            "notesText",
            "producer",
            "productionDate",
            "productionPlace",
            "contributor",
            "grantNumber",
            "distributor",
            "distributionDate",
            "depositor",
            "dateOfDeposit",
            "timePeriodCovered",
            "dateOfCollection",
            "kindOfData",
            "language",
            "series",
            "software",
            "relatedMaterial",
            "relatedDatasets",
            "otherReferences",
            "dataSources",
            "originOfSources",
            "characteristicOfSources",
            "accessToSources",
            "geospatial_displayName",
            "geographicCoverage",
            "geographicUnit",
            "geographicBoundingBox",
            "socialscience_displayName",
            "unitOfAnalysis",
            "universe",
            "timeMethod",
            "dataCollector",
            "collectorTraining",
            "frequencyOfDataCollection",
            "samplingProcedure",
            "targetSampleSize",
            "deviationsFromSampleDesign",
            "collectionMode",
            "researchInstrument",
            "dataCollectionSituation",
            "actionsToMinimizeLoss",
            "controlOperations",
            "weighting",
            "cleaningOperations",
            "datasetLevelErrorNotes",
            "responseRate",
            "samplingErrorEstimates",
            "otherDataAppraisal",
            "socialScienceNotes",
            "journal_displayName",
            "journalVolumeIssue",
            "journalArticleType",
        ]

    def validate_json(self, filename_schema=None):
        """Validate JSON formats of Dataset.

        Check if the JSON data structure is valid.

        Parameters
        ----------
        filename_schema : str
            Filename of the JSON schema, with full path.

        Returns
        -------
        bool
            `True` if the JSON validates correctly, `False` if not.

        Examples
        --------
        Check if JSON is valid for Dataverse API upload::

            >>> from pyDataverse.models import Dataset
            >>> ds = Dataset()
            >>> data = {
            ...     'title': 'pyDataverse study 2019',
            ...     'dsDescription': [
            ...         {'dsDescriptionValue': 'New study about pyDataverse usage in 2019'}
            ...     ]
            ... }
            >>> ds.set(data)
            >>> print(ds.validate_json())
            False
            >>> ds.author = [{'authorName': 'LastAuthor1, FirstAuthor1'}]
            >>> ds.datasetContact = [{'datasetContactName': 'LastContact1, FirstContact1'}]
            >>> ds.subject = ['Engineering']
            >>> print(ds.validate_json())
            True

        """
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        is_valid = True

        data_json = self.json(validate=False)
        if data_json:
            is_valid = validate_data(
                json.loads(data_json), filename_schema, file_format="json"
            )
            if not is_valid:
                return False
        else:
            return False

        # check if all required attributes are set
        for attr in self.__attr_dict_dv_up_required:
            if attr in list(self.__dict__.keys()):
                if not self.__getattribute__(attr):
                    is_valid = False
                    print("Attribute '{0}' is `False`.".format(attr))
            else:
                is_valid = False
                print("Attribute '{0}' missing.".format(attr))

        # check if attributes set are complete where necessary
        if "timePeriodCovered" in list(self.__dict__.keys()):
            tp_cov = self.__getattribute__("timePeriodCovered")
            if tp_cov:
                for tp in tp_cov:
                    if "timePeriodCoveredStart" in tp or "timePeriodCoveredEnd" in tp:
                        if not (
                            "timePeriodCoveredStart" in tp
                            and "timePeriodCoveredEnd" in tp
                        ):
                            is_valid = False
                            print("timePeriodCovered attribute missing.")

        if "dateOfCollection" in list(self.__dict__.keys()):
            d_coll = self.__getattribute__("dateOfCollection")
            if d_coll:
                for d in d_coll:
                    if "dateOfCollectionStart" in d or "dateOfCollectionEnd" in d:
                        if not (
                            "dateOfCollectionStart" in d and "dateOfCollectionEnd" in d
                        ):
                            is_valid = False
                            print("dateOfCollection attribute missing.")

        if "author" in list(self.__dict__.keys()):
            authors = self.__getattribute__("author")
            if authors:
                for a in authors:
                    if (
                        "authorAffiliation" in a
                        or "authorIdentifierScheme" in a
                        or "authorIdentifier" in a
                    ):
                        if "authorName" not in a:
                            is_valid = False
                            print("author attribute missing.")

        if "datasetContact" in list(self.__dict__.keys()):
            ds_contac = self.__getattribute__("datasetContact")
            if ds_contac:
                for c in ds_contac:
                    if "datasetContactAffiliation" in c or "datasetContactEmail" in c:
                        if "datasetContactName" not in c:
                            is_valid = False
                            print("datasetContact attribute missing.")

        if "producer" in list(self.__dict__.keys()):
            producer = self.__getattribute__("producer")
            if producer:
                for p in producer:
                    if (
                        "producerAffiliation" in p
                        or "producerAbbreviation" in p
                        or "producerURL" in p
                        or "producerLogoURL" in p
                    ):
                        # membership test first, so a missing key cannot raise
                        if "producerName" not in p or not p["producerName"]:
                            is_valid = False
                            print("producer attribute missing.")

        if "contributor" in list(self.__dict__.keys()):
            contributor = self.__getattribute__("contributor")
            if contributor:
                for c in contributor:
                    if "contributorType" in c:
                        if "contributorName" not in c:
                            is_valid = False
                            print("contributor attribute missing.")

        if "distributor" in list(self.__dict__.keys()):
            distributor = self.__getattribute__("distributor")
            if distributor:
                for d in distributor:
                    if (
                        "distributorAffiliation" in d
                        or "distributorAbbreviation" in d
                        or "distributorURL" in d
                        or "distributorLogoURL" in d
                    ):
                        if "distributorName" not in d:
                            is_valid = False
                            print("distributor attribute missing.")

        if "geographicBoundingBox" in list(self.__dict__.keys()):
            bbox = self.__getattribute__("geographicBoundingBox")
            if bbox:
                for b in bbox:
                    if b:
                        if not (
                            "westLongitude" in b
                            and "eastLongitude" in b
                            and "northLongitude" in b
                            and "southLongitude" in b
                        ):
                            is_valid = False
                            print("geographicBoundingBox attribute missing.")

        assert isinstance(is_valid, bool)
        return is_valid

    def from_json(
        self, json_str, data_format=None, validate=True, filename_schema=None
    ):
        """Import Dataset metadata from a JSON string.

        Parses in the metadata of a Dataset from different JSON formats.

        Parameters
        ----------
        json_str : str
            JSON string to be imported.
        data_format : str
            Data formats available for import. See `_allowed_json_formats`.
        validate : bool
            `True` if the imported JSON should be validated against a JSON
            schema file, `False` if the JSON string should be imported
            directly without validation.
        filename_schema : str
            Filename of the JSON schema, with full path.

        Examples
        --------
        Set Dataset attributes from an imported JSON file::

            >>> from pyDataverse.models import Dataset
            >>> from pyDataverse.utils import read_file
            >>> ds = Dataset()
            >>> ds.from_json(read_file('tests/data/dataset_upload_min_default.json'))
            >>> ds.title
            "Darwin's Finches"

        """
        assert isinstance(json_str, str)
        json_dict = json.loads(json_str)
        assert isinstance(json_dict, dict)
        assert isinstance(validate, bool)

        if data_format is None:
            data_format = self._default_json_format
        assert isinstance(data_format, str)
        assert data_format in self._allowed_json_formats

        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        data = {}

        if data_format == "dataverse_upload":
            if validate:
                validate_data(json_dict, filename_schema, file_format="json")
            # dataset
            # get first level metadata and parse it automatically
            for key, val in json_dict["datasetVersion"].items():
                if not key == "metadataBlocks":
                    if key in self.__attr_import_dv_up_datasetVersion_values:
                        data[key] = val
                    else:
                        print(
                            "Attribute {0} not valid for import (format={1}).".format(
                                key, data_format
                            )
                        )

            if "metadataBlocks" in json_dict["datasetVersion"]:

                # citation
                if "citation" in json_dict["datasetVersion"]["metadataBlocks"]:
                    citation = json_dict["datasetVersion"]["metadataBlocks"]["citation"]
                    if "displayName" in citation:
                        data["citation_displayName"] = citation["displayName"]

                    for field in citation["fields"]:
                        if (
                            field["typeName"]
                            in self.__attr_import_dv_up_citation_fields_values
                        ):
                            data[field["typeName"]] = field["value"]
                        elif (
                            field["typeName"]
                            in self.__attr_import_dv_up_citation_fields_arrays
                        ):
                            data[field["typeName"]] = self.__parse_field_array(
                                field["value"],
                                self.__attr_import_dv_up_citation_fields_arrays[
                                    field["typeName"]
                                ],
                            )
                        elif field["typeName"] == "series":
                            data["series"] = {}
                            if "seriesName" in field["value"]:
                                data["series"]["seriesName"] = field["value"][
                                    "seriesName"
                                ]["value"]
                            if "seriesInformation" in field["value"]:
                                data["series"]["seriesInformation"] = field["value"][
                                    "seriesInformation"
                                ]["value"]
                        else:
                            print(
                                "Attribute {0} not valid for import (dv_up).".format(
                                    field["typeName"]
                                )
                            )
                else:
                    # TODO: Exception
                    pass

                # geospatial
                if "geospatial" in json_dict["datasetVersion"]["metadataBlocks"]:
                    geospatial = json_dict["datasetVersion"]["metadataBlocks"][
                        "geospatial"
                    ]
                    if "displayName" in geospatial:
                        self.__setattr__(
                            "geospatial_displayName", geospatial["displayName"]
                        )

                    for field in geospatial["fields"]:
                        if (
                            field["typeName"]
                            in self.__attr_import_dv_up_geospatial_fields_values
                        ):
                            data[field["typeName"]] = field["value"]
                        elif (
                            field["typeName"]
                            in self.__attr_import_dv_up_geospatial_fields_arrays
                        ):
                            data[field["typeName"]] = self.__parse_field_array(
                                field["value"],
                                self.__attr_import_dv_up_geospatial_fields_arrays[
                                    field["typeName"]
                                ],
                            )
                        else:
                            print(
                                "Attribute {0} not valid for import (dv_up).".format(
                                    field["typeName"]
                                )
                            )
                else:
                    # TODO: Exception
                    pass

                # socialscience
                if "socialscience" in json_dict["datasetVersion"]["metadataBlocks"]:
                    socialscience = json_dict["datasetVersion"]["metadataBlocks"][
                        "socialscience"
                    ]
                    if "displayName" in socialscience:
                        self.__setattr__(
                            "socialscience_displayName",
                            socialscience["displayName"],
                        )

                    for field in socialscience["fields"]:
                        if (
                            field["typeName"]
                            in self.__attr_import_dv_up_socialscience_fields_values
                        ):
                            data[field["typeName"]] = field["value"]
                        elif field["typeName"] == "targetSampleSize":
                            data["targetSampleSize"] = {}
                            if "targetSampleActualSize" in field["value"]:
                                data["targetSampleSize"][
                                    "targetSampleActualSize"
                                ] = field["value"]["targetSampleActualSize"]["value"]
                            if "targetSampleSizeFormula" in field["value"]:
                                data["targetSampleSize"][
                                    "targetSampleSizeFormula"
                                ] = field["value"]["targetSampleSizeFormula"]["value"]
                        elif field["typeName"] == "socialScienceNotes":
                            data["socialScienceNotes"] = {}
                            if "socialScienceNotesType" in field["value"]:
                                data["socialScienceNotes"][
                                    "socialScienceNotesType"
                                ] = field["value"]["socialScienceNotesType"]["value"]
                            if "socialScienceNotesSubject" in field["value"]:
                                data["socialScienceNotes"][
                                    "socialScienceNotesSubject"
                                ] = field["value"]["socialScienceNotesSubject"]["value"]
                            if "socialScienceNotesText" in field["value"]:
                                data["socialScienceNotes"][
                                    "socialScienceNotesText"
                                ] = field["value"]["socialScienceNotesText"]["value"]
                        else:
                            print(
                                "Attribute {0} not valid for import (dv_up).".format(
                                    field["typeName"]
                                )
                            )
                else:
                    # TODO: Exception
                    pass

                # journal
                if "journal" in json_dict["datasetVersion"]["metadataBlocks"]:
                    journal = json_dict["datasetVersion"]["metadataBlocks"]["journal"]
                    if "displayName" in journal:
                        self.__setattr__("journal_displayName", journal["displayName"])

                    for field in journal["fields"]:
                        if (
                            field["typeName"]
                            in self.__attr_import_dv_up_journal_fields_values
                        ):
                            data[field["typeName"]] = field["value"]
                        elif (
                            field["typeName"]
                            in self.__attr_import_dv_up_journal_fields_arrays
                        ):
                            data[field["typeName"]] = self.__parse_field_array(
                                field["value"],
                                self.__attr_import_dv_up_journal_fields_arrays[
                                    field["typeName"]
                                ],
                            )
                        else:
                            print(
                                "Attribute {0} not valid for import (dv_up).".format(
                                    field["typeName"]
                                )
                            )
                else:
                    # TODO: Exception
                    pass
        elif data_format == "dataverse_download":
            print("INFO: Not implemented yet.")
        elif data_format == "dspace":
            print("INFO: Not implemented yet.")
        elif data_format == "custom":
            print("INFO: Not implemented yet.")

        self.set(data)

    def __parse_field_array(self, data, attr_list):
        """Parse arrays of the Dataset upload format.

        Parameters
        ----------
        data : list
            List of dictionaries of a specific Dataverse API metadata field.
        attr_list : list
            List of attributes to be parsed.

        Returns
        -------
        list
            List of :class:`dict`s with parsed out key-value pairs.

        """
        assert isinstance(data, list)
        assert isinstance(attr_list, list)

        data_tmp = []

        for d in data:
            tmp_dict = {}
            for key, val in d.items():
                if key in attr_list:
                    tmp_dict[key] = val["value"]
                else:
                    print("Key '{0}' not in attribute list".format(key))
            data_tmp.append(tmp_dict)

        assert isinstance(data_tmp, list)
        return data_tmp

    def __generate_field_arrays(self, key, sub_keys):
        """Generate dicts for array attributes of Dataverse API metadata upload.

        Parameters
        ----------
        key : str
            Name of attribute.
        sub_keys : list
            List of keys to be created.

        Returns
        -------
        list
            List of filled :class:`dict`s of metadata for Dataverse API upload.

        """
        assert isinstance(key, str)
        assert isinstance(sub_keys, list)

        # check if attribute exists
        tmp_list = []
        if self.__getattribute__(key):
            attr = self.__getattribute__(key)
            # loop over list of attribute dicts
            for d in attr:
                tmp_dict = {}
                # iterate over key-value pairs
                for k, v in d.items():
                    # check if key is in attribute list
                    if k in sub_keys:
                        multiple = None
                        type_class = None
                        if isinstance(v, list):
                            multiple = True
                        else:
                            multiple = False
                        if k in self.__attr_dict_dv_up_type_class_primitive:
                            type_class = "primitive"
                        elif k in self.__attr_dict_dv_up_type_class_compound:
                            type_class = "compound"
                        elif (
                            k in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                        ):
                            type_class = "controlledVocabulary"
                        tmp_dict[k] = {}
                        tmp_dict[k]["typeName"] = k
                        tmp_dict[k]["typeClass"] = type_class
                        tmp_dict[k]["multiple"] = multiple
                        tmp_dict[k]["value"] = v
                tmp_list.append(tmp_dict)

        assert isinstance(tmp_list, list)
        return tmp_list
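
    # Shape sketch (illustrative) of what the two helpers above translate
    # between: `__parse_field_array` flattens the Dataverse API compound-field
    # structure
    #
    #   [{"authorName": {"typeName": "authorName", "typeClass": "primitive",
    #                    "multiple": False, "value": "LastAuthor1, FirstAuthor1"}}]
    #
    # into [{"authorName": "LastAuthor1, FirstAuthor1"}], while
    # `__generate_field_arrays` rebuilds the typed API structure for upload.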

    def json(self, data_format=None, validate=True, filename_schema=None):
        """Create Dataset JSON from attributes.

        Parameters
        ----------
        data_format : str
            Data format of the JSON output. See `_allowed_json_formats`.
        validate : bool
            `True` if the created JSON should be validated against a JSON
            schema file, `False` if the JSON string should be created
            without validation.
        filename_schema : str
            Filename of the JSON schema, with full path.

        Returns
        -------
        str
            The data as a JSON string.

        """
        assert isinstance(validate, bool)
        if data_format is None:
            data_format = self._default_json_format
        assert isinstance(data_format, str)
        assert data_format in self._allowed_json_formats
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        data = {}

        if data_format == "dataverse_upload":
            data_dict = self.get()
            data["datasetVersion"] = {}
            data["datasetVersion"]["metadataBlocks"] = {}
            citation = {}
            citation["fields"] = []

            # dataset
            # Generate first level attributes
            for attr in self.__attr_import_dv_up_datasetVersion_values:
                if attr in data_dict:
                    data["datasetVersion"][attr] = data_dict[attr]

            # citation
            if "citation_displayName" in data_dict:
                citation["displayName"] = data_dict["citation_displayName"]

            # Generate first level attributes
            for attr in self.__attr_import_dv_up_citation_fields_values:
                if attr in data_dict:
                    v = data_dict[attr]
                    if isinstance(v, list):
                        multiple = True
                    else:
                        multiple = False
                    if attr in self.__attr_dict_dv_up_type_class_primitive:
                        type_class = "primitive"
                    elif attr in self.__attr_dict_dv_up_type_class_compound:
                        type_class = "compound"
                    elif (
                        attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                    ):
                        type_class = "controlledVocabulary"
                    citation["fields"].append(
                        {
                            "typeName": attr,
                            "multiple": multiple,
                            "typeClass": type_class,
                            "value": v,
                        }
                    )

            # Generate fields attributes
            for key, val in self.__attr_import_dv_up_citation_fields_arrays.items():
                if key in data_dict:
                    citation["fields"].append(
                        {
                            "typeName": key,
                            "multiple": True,
                            "typeClass": "compound",
                            "value": self.__generate_field_arrays(key, val),
                        }
                    )

            # Generate series attributes
            if "series" in data_dict:
                series = data_dict["series"]
                tmp_dict = {}
                if "seriesName" in series:
                    if series["seriesName"] is not None:
                        tmp_dict["seriesName"] = {}
                        tmp_dict["seriesName"]["typeName"] = "seriesName"
                        tmp_dict["seriesName"]["multiple"] = False
                        tmp_dict["seriesName"]["typeClass"] = "primitive"
                        tmp_dict["seriesName"]["value"] = series["seriesName"]
                if "seriesInformation" in series:
                    if series["seriesInformation"] is not None:
                        tmp_dict["seriesInformation"] = {}
                        tmp_dict["seriesInformation"]["typeName"] = "seriesInformation"
                        tmp_dict["seriesInformation"]["multiple"] = False
                        tmp_dict["seriesInformation"]["typeClass"] = "primitive"
                        tmp_dict["seriesInformation"]["value"] = series[
                            "seriesInformation"
                        ]
                citation["fields"].append(
                    {
                        "typeName": "series",
                        "multiple": False,
                        "typeClass": "compound",
                        "value": tmp_dict,
                    }
                )

            # geospatial
            for attr in (
                self.__attr_import_dv_up_geospatial_fields_values
                + list(self.__attr_import_dv_up_geospatial_fields_arrays.keys())
                + ["geospatial_displayName"]
            ):
                if attr in data_dict:
                    geospatial = {}
                    if attr != "geospatial_displayName":
                        geospatial["fields"] = []
                    break

            if "geospatial_displayName" in data_dict:
                geospatial["displayName"] = data_dict["geospatial_displayName"]

            # Generate first level attributes
            for attr in self.__attr_import_dv_up_geospatial_fields_values:
                if attr in data_dict:
                    v = data_dict[attr]
                    if isinstance(v, list):
                        multiple = True
                    else:
                        multiple = False
                    if attr in self.__attr_dict_dv_up_type_class_primitive:
                        type_class = "primitive"
                    elif attr in self.__attr_dict_dv_up_type_class_compound:
                        type_class = "compound"
                    elif (
                        attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                    ):
                        type_class = "controlledVocabulary"
                    geospatial["fields"].append(
                        {
                            "typeName": attr,
                            "multiple": multiple,
                            "typeClass": type_class,
                            "value": v,
                        }
                    )

            # Generate fields attributes
            for key, val in self.__attr_import_dv_up_geospatial_fields_arrays.items():
                if key in data_dict:
                    geospatial["fields"].append(
                        {
                            "typeName": key,
                            "multiple": True,
                            "typeClass": "compound",
                            "value": self.__generate_field_arrays(key, val),
                        }
                    )

            # socialscience
            for attr in self.__attr_import_dv_up_socialscience_fields_values + [
                "socialscience_displayName"
            ]:
                if attr in data_dict:
                    socialscience = {}
                    if attr != "socialscience_displayName":
                        socialscience["fields"] = []
                    break

            if "socialscience_displayName" in data_dict:
                socialscience["displayName"] = data_dict["socialscience_displayName"]

            # Generate first level attributes
            for attr in self.__attr_import_dv_up_socialscience_fields_values:
                if attr in data_dict:
                    v = data_dict[attr]
                    if isinstance(v, list):
                        multiple = True
                    else:
                        multiple = False
                    if attr in self.__attr_dict_dv_up_type_class_primitive:
                        type_class = "primitive"
                    elif attr in self.__attr_dict_dv_up_type_class_compound:
                        type_class = "compound"
                    elif (
                        attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                    ):
                        type_class = "controlledVocabulary"
                    socialscience["fields"].append(
                        {
                            "typeName": attr,
                            "multiple": multiple,
                            "typeClass": type_class,
                            "value": v,
                        }
                    )

            # Generate targetSampleSize attributes
            if "targetSampleSize" in data_dict:
                target_sample_size = data_dict["targetSampleSize"]
                tmp_dict = {}
                if "targetSampleActualSize" in target_sample_size:
                    if target_sample_size["targetSampleActualSize"] is not None:
                        tmp_dict["targetSampleActualSize"] = {}
                        tmp_dict["targetSampleActualSize"][
                            "typeName"
                        ] = "targetSampleActualSize"
                        tmp_dict["targetSampleActualSize"]["multiple"] = False
                        tmp_dict["targetSampleActualSize"]["typeClass"] = "primitive"
                        tmp_dict["targetSampleActualSize"]["value"] = target_sample_size[
                            "targetSampleActualSize"
                        ]
                if "targetSampleSizeFormula" in target_sample_size:
                    if target_sample_size["targetSampleSizeFormula"] is not None:
                        tmp_dict["targetSampleSizeFormula"] = {}
                        tmp_dict["targetSampleSizeFormula"][
                            "typeName"
                        ] = "targetSampleSizeFormula"
                        tmp_dict["targetSampleSizeFormula"]["multiple"] = False
                        tmp_dict["targetSampleSizeFormula"]["typeClass"] = "primitive"
                        tmp_dict["targetSampleSizeFormula"][
                            "value"
                        ] = target_sample_size["targetSampleSizeFormula"]
                socialscience["fields"].append(
                    {
                        "typeName": "targetSampleSize",
                        "multiple": False,
                        "typeClass": "compound",
                        "value": tmp_dict,
                    }
                )

            # Generate socialScienceNotes attributes
            if "socialScienceNotes" in data_dict:
                social_science_notes = data_dict["socialScienceNotes"]
                tmp_dict = {}
                if "socialScienceNotesType" in social_science_notes:
                    if social_science_notes["socialScienceNotesType"] is not None:
                        tmp_dict["socialScienceNotesType"] = {}
                        tmp_dict["socialScienceNotesType"][
                            "typeName"
                        ] = "socialScienceNotesType"
                        tmp_dict["socialScienceNotesType"]["multiple"] = False
                        tmp_dict["socialScienceNotesType"]["typeClass"] = "primitive"
                        tmp_dict["socialScienceNotesType"][
                            "value"
                        ] = social_science_notes["socialScienceNotesType"]
                if "socialScienceNotesSubject" in social_science_notes:
                    if social_science_notes["socialScienceNotesSubject"] is not None:
                        tmp_dict["socialScienceNotesSubject"] = {}
                        tmp_dict["socialScienceNotesSubject"][
                            "typeName"
                        ] = "socialScienceNotesSubject"
                        tmp_dict["socialScienceNotesSubject"]["multiple"] = False
                        tmp_dict["socialScienceNotesSubject"]["typeClass"] = "primitive"
                        tmp_dict["socialScienceNotesSubject"][
                            "value"
                        ] = social_science_notes["socialScienceNotesSubject"]
                if "socialScienceNotesText" in social_science_notes:
                    if social_science_notes["socialScienceNotesText"] is not None:
                        tmp_dict["socialScienceNotesText"] = {}
                        tmp_dict["socialScienceNotesText"][
                            "typeName"
                        ] = "socialScienceNotesText"
                        tmp_dict["socialScienceNotesText"]["multiple"] = False
                        tmp_dict["socialScienceNotesText"]["typeClass"] = "primitive"
                        tmp_dict["socialScienceNotesText"][
                            "value"
                        ] = social_science_notes["socialScienceNotesText"]
                socialscience["fields"].append(
                    {
                        "typeName": "socialScienceNotes",
                        "multiple": False,
                        "typeClass": "compound",
                        "value": tmp_dict,
                    }
                )

            # journal
            for attr in (
                self.__attr_import_dv_up_journal_fields_values
                + list(self.__attr_import_dv_up_journal_fields_arrays.keys())
                + ["journal_displayName"]
            ):
                if attr in data_dict:
                    journal = {}
                    if attr != "journal_displayName":
                        journal["fields"] = []
                    break

            if "journal_displayName" in data_dict:
                journal["displayName"] = data_dict["journal_displayName"]

            # Generate first level attributes
            for attr in self.__attr_import_dv_up_journal_fields_values:
                if attr in data_dict:
                    v = data_dict[attr]
                    if isinstance(v, list):
                        multiple = True
                    else:
                        multiple = False
                    if attr in self.__attr_dict_dv_up_type_class_primitive:
                        type_class = "primitive"
                    elif attr in self.__attr_dict_dv_up_type_class_compound:
                        type_class = "compound"
                    elif (
                        attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                    ):
                        type_class = "controlledVocabulary"
                    journal["fields"].append(
                        {
                            "typeName": attr,
                            "multiple": multiple,
                            "typeClass": type_class,
                            "value": v,
                        }
                    )

            # Generate fields attributes
            for key, val in self.__attr_import_dv_up_journal_fields_arrays.items():
                if key in data_dict:
                    journal["fields"].append(
                        {
                            "typeName": key,
                            "multiple": True,
                            "typeClass": "compound",
                            "value": self.__generate_field_arrays(key, val),
                        }
                    )

            data["datasetVersion"]["metadataBlocks"]["citation"] = citation
            if "socialscience" in locals():
                data["datasetVersion"]["metadataBlocks"][
                    "socialscience"
                ] = socialscience
            if "geospatial" in locals():
                data["datasetVersion"]["metadataBlocks"]["geospatial"] = geospatial
            if "journal" in locals():
                data["datasetVersion"]["metadataBlocks"]["journal"] = journal
        elif data_format == "dspace":
            data = None
            print("INFO: Not implemented yet.")
        elif data_format == "custom":
            data = None
            print("INFO: Not implemented yet.")

        if validate:
            validate_data(data, filename_schema)

        json_str = json.dumps(data, indent=2)
        assert isinstance(json_str, str)
        return json_str


class Datafile(DVObject):
    """Base class for the Dataverse data type `Datafile`.

    Attributes
    ----------
    _default_json_format : str
        Default JSON data format.
    _default_json_schema_filename : str
        Default JSON schema filename.
    _allowed_json_formats : list
        List of all possible JSON data formats.
    _json_dataverse_upload_attr : list
        List of all attributes to be exported in :func:`json`.

    """

    def __init__(self, data=None):
        """Init :class:`Datafile()`.

        Inherits attributes from parent :class:`DVObject()`.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        Examples
        --------
        Create a Datafile::

            >>> from pyDataverse.models import Datafile
            >>> df = Datafile()
            >>> df._default_json_schema_filename
            'schemas/json/datafile_upload_schema.json'

        """
        self._internal_attributes = ["_Datafile" + attr for attr in INTERNAL_ATTRIBUTES]

        super().__init__(data=data)

        self._default_json_format = "dataverse_upload"
        self._default_json_schema_filename = "schemas/json/datafile_upload_schema.json"
        self._allowed_json_formats = ["dataverse_upload", "dataverse_download"]
        self._json_dataverse_upload_attr = [
            "description",
            "categories",
            "restrict",
            "label",
            "directoryLabel",
            "pid",
            "filename",
        ]
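

if __name__ == "__main__":
    # Demo sketch, not part of the original module: build a Datafile with
    # illustrative values (the `pid` below uses the DataCite test prefix and is
    # hypothetical) and print its Dataverse upload JSON. `validate=False`
    # skips the schema check.
    df = Datafile()
    df.set(
        {
            "pid": "doi:10.5072/FK2/EXAMPLE",  # hypothetical persistent identifier
            "filename": "example.csv",
            "description": "An example data file.",
            "restrict": False,
        }
    )
    print(df.json(validate=False))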