"""Dataverse data-types data model."""
from __future__ import absolute_import
import json
import os
from pyDataverse.utils import validate_data
# Names of the configuration attributes of the data-model classes.
# ``DVObject.set()`` refuses to overwrite the class-prefixed variants of
# these (see ``_internal_attributes`` on each subclass), and
# ``DVObject.get()`` excludes them from exported metadata dicts.
INTERNAL_ATTRIBUTES = [
    "_default_json_format",
    "_default_json_schema_filename",
    "_allowed_json_formats",
    "_json_dataverse_upload_attr",
    "_internal_attributes",
]
class DVObject:
    """Base class for the Dataverse data types `Dataverse`, `Dataset` and `Datafile`."""

    def __init__(self, data=None):
        """Init :class:`DVObject`.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.
        """
        if data is not None:
            self.set(data)

    def set(self, data):
        """Set class attributes from a flat dictionary.

        The flat dict is the main way to set the class attributes.
        It is the main interface between the object and the outside world.
        Keys listed in ``self._internal_attributes`` are rejected, because
        they configure the data model itself and must not be overwritten
        by imported metadata.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        Raises
        ------
        AssertionError
            If `data` is not a :class:`dict`.
        """
        assert isinstance(data, dict)
        for key, val in data.items():
            if key in self._internal_attributes:
                print("Importing attribute {0} not allowed.".format(key))
            else:
                setattr(self, key, val)

    def get(self):
        """Create flat `dict` of all metadata attributes.

        Creates :class:`dict` with all attributes in a flat structure.
        Internal configuration attributes (see ``INTERNAL_ATTRIBUTES``)
        are excluded. The flat :class:`dict` can then be used for further
        processing.

        Returns
        -------
        dict
            Data in a flat data structure.
        """
        data = {
            attr: getattr(self, attr)
            for attr in self.__dict__
            if attr not in INTERNAL_ATTRIBUTES
        }
        assert isinstance(data, dict)
        return data

    def validate_json(self, filename_schema=None):
        """Validate JSON formats.

        Check if the object's JSON data structure is valid.

        Parameters
        ----------
        filename_schema : str
            Filename of JSON schema with full path. Defaults to the schema
            named in ``self._default_json_schema_filename``, resolved
            relative to this module's directory.

        Returns
        -------
        bool
            `True` if JSON validates correctly, `False` if not.
        """
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)
        return validate_data(
            json.loads(self.json(validate=False)), filename_schema, file_format="json",
        )

    def from_json(
        self, json_str, data_format=None, validate=True, filename_schema=None
    ):
        """Import metadata from a JSON string.

        Parses in the metadata from different JSON formats and stores the
        accepted keys as attributes via :func:`set`. Note: this method
        returns ``None``; unknown attributes are reported on stdout.

        Parameters
        ----------
        json_str : str
            JSON string to be imported.
        data_format : str
            Data formats available for import. See `_allowed_json_formats`.
        validate : bool
            `True`, if imported JSON should be validated against a JSON
            schema file. `False`, if JSON string should be imported directly
            and not checked if valid.
        filename_schema : str
            Filename of JSON schema with full path.
        """
        assert isinstance(json_str, str)
        json_dict = json.loads(json_str)
        assert isinstance(json_dict, dict)
        assert isinstance(validate, bool)
        if data_format is None:
            data_format = self._default_json_format
        assert isinstance(data_format, str)
        assert data_format in self._allowed_json_formats
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        data = {}
        if data_format == "dataverse_upload":
            if validate:
                validate_data(json_dict, filename_schema)
            # get first level metadata and parse it automatically
            for key in json_dict.keys():
                if key in self._json_dataverse_upload_attr:
                    data[key] = json_dict[key]
                else:
                    print(
                        "INFO: Attribute {0} not valid for import (data format=`{1}`).".format(
                            key, data_format
                        )
                    )
        elif data_format == "dataverse_download":
            print("INFO: Not implemented yet.")
        elif data_format == "dspace":
            print("INFO: Not implemented yet.")
        elif data_format == "custom":
            print("INFO: Not implemented yet.")
        else:
            # TODO: add exception for wrong data format
            pass
        self.set(data)

    def json(self, data_format=None, validate=True, filename_schema=None):
        r"""Create JSON from :class:`DVObject` attributes.

        Parameters
        ----------
        data_format : str
            Data formats to be validated. See `_allowed_json_formats`.
        validate : bool
            `True`, if created JSON should be validated against a JSON schema
            file. `False`, if JSON string should be created and not checked if
            valid.
        filename_schema : str
            Filename of JSON schema with full path.

        Returns
        -------
        str
            The data as a JSON string. Returns ``False`` for the
            not-yet-implemented `dspace` and `custom` formats.
        """
        assert isinstance(validate, bool)
        if data_format is None:
            data_format = self._default_json_format
        assert isinstance(data_format, str)
        assert data_format in self._allowed_json_formats
        if filename_schema is None:
            filename_schema = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                self._default_json_schema_filename,
            )
        assert isinstance(filename_schema, str)

        data = {}
        if data_format == "dataverse_upload":
            for attr in self._json_dataverse_upload_attr:
                # only export attributes that are actually set
                if hasattr(self, attr):
                    data[attr] = getattr(self, attr)
        elif data_format == "dspace":
            print("INFO: Not implemented yet.")
            return False
        elif data_format == "custom":
            print("INFO: Not implemented yet.")
            return False
        if validate:
            validate_data(data, filename_schema)
        json_str = json.dumps(data, indent=2)
        assert isinstance(json_str, str)
        return json_str
class Dataverse(DVObject):
    """Data model for the Dataverse data type `Dataverse`.

    Attributes
    ----------
    _default_json_format : str
        Default JSON data format.
    _default_json_schema_filename : str
        Default JSON schema filename.
    _allowed_json_formats : list
        List of all possible JSON data formats.
    _json_dataverse_upload_attr : list
        List of all attributes to be exported in :func:`json`.
    """

    def __init__(self, data=None):
        """Init :class:`Dataverse()`.

        Inherits attributes from parent :class:`DVObject()`.

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similarly
            named attribute and its value.

        Examples
        --------
        Create a Dataverse::

            >>> from pyDataverse.models import Dataverse
            >>> dv = Dataverse()
            >>> print(dv._default_json_schema_filename)
            'schemas/json/dataverse_upload_schema.json'
        """
        # The guard list must exist before DVObject.__init__ runs, because
        # set() consults it when `data` is passed in.
        self._internal_attributes = [
            "_Dataverse{0}".format(attr) for attr in INTERNAL_ATTRIBUTES
        ]
        super().__init__(data=data)
        self._default_json_format = "dataverse_upload"
        self._default_json_schema_filename = "schemas/json/dataverse_upload_schema.json"
        self._allowed_json_formats = ["dataverse_upload", "dataverse_download"]
        self._json_dataverse_upload_attr = [
            "affiliation",
            "alias",
            "dataverseContacts",
            "dataverseType",
            "description",
            "name",
        ]
[docs]class Dataset(DVObject):
"""Base class for the Dataverse data type `Dataset`.
Attributes
----------
_default_json_format : str
Default JSON data format.
_default_json_schema_filename : str
Default JSON schema filename.
_allowed_json_formats : list
List of all possible JSON data formats.
_json_dataverse_upload_attr : list
List with all attributes to be exported in :func:`json`.
__attr_import_dv_up_datasetVersion_values : list
Dataverse API Upload Dataset JSON attributes inside ds[\'datasetVersion\'].
__attr_import_dv_up_citation_fields_values : list
Dataverse API Upload Dataset JSON attributes inside
ds[\'datasetVersion\'][\'metadataBlocks\'][\'citation\'][\'fields\'].
__attr_import_dv_up_citation_fields_arrays : dict
Dataverse API Upload Dataset JSON attributes inside
[\'datasetVersion\'][\'metadataBlocks\'][\'citation\'][\'fields\'].
__attr_import_dv_up_geospatial_fields_values : list
Attributes of Dataverse API Upload Dataset JSON metadata standard inside
[\'datasetVersion\'][\'metadataBlocks\'][\'geospatial\'][\'fields\'].
__attr_import_dv_up_geospatial_fields_arrays : dict
Attributes of Dataverse API Upload Dataset JSON metadata standard inside
[\'datasetVersion\'][\'metadataBlocks\'][\'geospatial\'][\'fields\'].
__attr_import_dv_up_socialscience_fields_values : list
Attributes of Dataverse API Upload Dataset JSON metadata standard inside
[\'datasetVersion\'][\'metadataBlocks\'][\'socialscience\'][\'fields\'].
__attr_import_dv_up_journal_fields_values : list
Attributes of Dataverse API Upload Dataset JSON metadata standard inside
[\'datasetVersion\'][\'metadataBlocks\'][\'journal\'][\'fields\'].
__attr_import_dv_up_journal_fields_arrays : dict
Attributes of Dataverse API Upload Dataset JSON metadata standard inside
[\'datasetVersion\'][\'metadataBlocks\'][\'journal\'][\'fields\'].
__attr_dict_dv_up_required :list
Required attributes for valid `dv_up` metadata dict creation.
__attr_dict_dv_up_type_class_primitive : list
typeClass primitive.
__attr_dict_dv_up_type_class_compound : list
typeClass compound.
__attr_dict_dv_up_type_class_controlled_vocabulary : list
typeClass controlledVocabulary.
__attr_dict_dv_up_single_dict : list
This attributes are excluded from automatic parsing in ds.get() creation.
__attr_displayNames : list
Attributes of displayName.
"""
# --- Dataverse API upload ("dv_up") lookup tables -----------------------
# Keys accepted directly under ds['datasetVersion'].
__attr_import_dv_up_datasetVersion_values = [
    "license",
    "termsOfAccess",
    "fileAccessRequest",
    "protocol",
    "authority",
    "identifier",
    "termsOfUse",
]
# Scalar citation fields inside ['metadataBlocks']['citation']['fields'].
__attr_import_dv_up_citation_fields_values = [
    "accessToSources",
    "alternativeTitle",
    "alternativeURL",
    "characteristicOfSources",
    "dateOfDeposit",
    "dataSources",
    "depositor",
    "distributionDate",
    "kindOfData",
    "language",
    "notesText",
    "originOfSources",
    "otherReferences",
    "productionDate",
    "productionPlace",
    "relatedDatasets",
    "relatedMaterial",
    "subject",
    "subtitle",
    "title",
]
# Compound (array-valued) citation fields: field name -> allowed sub-keys.
__attr_import_dv_up_citation_fields_arrays = {
    "author": [
        "authorName",
        "authorAffiliation",
        "authorIdentifierScheme",
        "authorIdentifier",
    ],
    "contributor": ["contributorType", "contributorName"],
    "dateOfCollection": ["dateOfCollectionStart", "dateOfCollectionEnd"],
    "datasetContact": [
        "datasetContactName",
        "datasetContactAffiliation",
        "datasetContactEmail",
    ],
    "distributor": [
        "distributorName",
        "distributorAffiliation",
        "distributorAbbreviation",
        "distributorURL",
        "distributorLogoURL",
    ],
    "dsDescription": ["dsDescriptionValue", "dsDescriptionDate"],
    "grantNumber": ["grantNumberAgency", "grantNumberValue"],
    "keyword": ["keywordValue", "keywordVocabulary", "keywordVocabularyURI"],
    "producer": [
        "producerName",
        "producerAffiliation",
        "producerAbbreviation",
        "producerURL",
        "producerLogoURL",
    ],
    "otherId": ["otherIdAgency", "otherIdValue"],
    "publication": [
        "publicationCitation",
        "publicationIDType",
        "publicationIDNumber",
        "publicationURL",
    ],
    "software": ["softwareName", "softwareVersion"],
    "timePeriodCovered": ["timePeriodCoveredStart", "timePeriodCoveredEnd"],
    "topicClassification": [
        "topicClassValue",
        "topicClassVocab",
        "topicClassVocabURI",
    ],
}
# Scalar / compound geospatial fields.
__attr_import_dv_up_geospatial_fields_values = ["geographicUnit"]
__attr_import_dv_up_geospatial_fields_arrays = {
    "geographicBoundingBox": [
        "westLongitude",
        "eastLongitude",
        "northLongitude",
        "southLongitude",
    ],
    "geographicCoverage": ["country", "state", "city", "otherGeographicCoverage"],
}
# Scalar socialscience fields.
__attr_import_dv_up_socialscience_fields_values = [
    "actionsToMinimizeLoss",
    "cleaningOperations",
    "collectionMode",
    "collectorTraining",
    "controlOperations",
    "dataCollectionSituation",
    "dataCollector",
    "datasetLevelErrorNotes",
    "deviationsFromSampleDesign",
    "frequencyOfDataCollection",
    "otherDataAppraisal",
    "researchInstrument",
    "responseRate",
    "samplingErrorEstimates",
    "samplingProcedure",
    "unitOfAnalysis",
    "universe",
    "timeMethod",
    "weighting",
]
# Scalar / compound journal fields.
__attr_import_dv_up_journal_fields_values = ["journalArticleType"]
__attr_import_dv_up_journal_fields_arrays = {
    "journalVolumeIssue": ["journalVolume", "journalIssue", "journalPubDate"]
}
# Attributes that must be present for a valid dv_up metadata dict.
__attr_dict_dv_up_required = [
    "author",
    "datasetContact",
    "dsDescription",
    "subject",
    "title",
]
# typeClass "primitive" attributes, assembled from the import tables above.
__attr_dict_dv_up_type_class_primitive = (
    [
        "accessToSources",
        "alternativeTitle",
        "alternativeURL",
        "authorAffiliation",
        "authorIdentifier",
        "authorName",
        "characteristicOfSources",
        "city",
        "contributorName",
        "dateOfDeposit",
        "dataSources",
        "depositor",
        "distributionDate",
        "kindOfData",
        "notesText",
        "originOfSources",
        "otherGeographicCoverage",
        "otherReferences",
        "productionDate",
        "productionPlace",
        "publicationCitation",
        "publicationIDNumber",
        "publicationURL",
        "relatedDatasets",
        "relatedMaterial",
        "seriesInformation",
        "seriesName",
        "state",
        "subtitle",
        "title",
    ]
    + __attr_import_dv_up_citation_fields_arrays["dateOfCollection"]
    + __attr_import_dv_up_citation_fields_arrays["datasetContact"]
    + __attr_import_dv_up_citation_fields_arrays["distributor"]
    + __attr_import_dv_up_citation_fields_arrays["dsDescription"]
    + __attr_import_dv_up_citation_fields_arrays["grantNumber"]
    + __attr_import_dv_up_citation_fields_arrays["keyword"]
    + __attr_import_dv_up_citation_fields_arrays["producer"]
    + __attr_import_dv_up_citation_fields_arrays["otherId"]
    + __attr_import_dv_up_citation_fields_arrays["software"]
    + __attr_import_dv_up_citation_fields_arrays["timePeriodCovered"]
    + __attr_import_dv_up_citation_fields_arrays["topicClassification"]
    + __attr_import_dv_up_geospatial_fields_values
    + __attr_import_dv_up_geospatial_fields_arrays["geographicBoundingBox"]
    + __attr_import_dv_up_socialscience_fields_values
    + __attr_import_dv_up_journal_fields_arrays["journalVolumeIssue"]
    + [
        "socialScienceNotesType",
        "socialScienceNotesSubject",
        "socialScienceNotesText",
    ]
    + ["targetSampleActualSize", "targetSampleSizeFormula"]
)
# typeClass "compound" attributes (the array-field names themselves).
__attr_dict_dv_up_type_class_compound = (
    list(__attr_import_dv_up_citation_fields_arrays.keys())
    + list(__attr_import_dv_up_geospatial_fields_arrays.keys())
    + list(__attr_import_dv_up_journal_fields_arrays.keys())
    + ["series", "socialScienceNotes", "targetSampleSize"]
)
# typeClass "controlledVocabulary" attributes.
__attr_dict_dv_up_type_class_controlled_vocabulary = [
    "authorIdentifierScheme",
    "contributorType",
    "country",
    "journalArticleType",
    "language",
    "publicationIDType",
    "subject",
]
# Attributes stored as a single dict (excluded from automatic array parsing).
__attr_dict_dv_up_single_dict = ["series", "socialScienceNotes", "targetSampleSize"]
# displayName attributes of the four metadata blocks.
__attr_displayNames = [
    "citation_displayName",
    "geospatial_displayName",
    "socialscience_displayName",
    "journal_displayName",
]
def __init__(self, data=None):
    """Init a Dataset() class.

    Parameters
    ----------
    data : dict
        Flat dictionary. All keys will be mapped to a similarly
        named attribute and its value.

    Examples
    --------
    Create a Dataset::

        >>> from pyDataverse.models import Dataset
        >>> ds = Dataset()
        >>> print(ds._default_json_schema_filename)
        'schemas/json/dataset_upload_default_schema.json'
    """
    # The guard list must exist before DVObject.__init__ runs, because
    # set() consults it when `data` is passed in.
    self._internal_attributes = [
        "_Dataset{0}".format(attr) for attr in INTERNAL_ATTRIBUTES
    ]
    super().__init__(data=data)
    self._default_json_format = "dataverse_upload"
    self._default_json_schema_filename = "schemas/json/dataset_upload_default_schema.json"
    self._allowed_json_formats = [
        "dataverse_upload",
        "dataverse_download",
        "dspace",
        "custom",
    ]
    # Attributes exported by json(); order defines the export order.
    self._json_dataverse_upload_attr = [
        "license",
        "termsOfUse",
        "termsOfAccess",
        "fileAccessRequest",
        "protocol",
        "authority",
        "identifier",
        "citation_displayName",
        "title",
        "subtitle",
        "alternativeTitle",
        "alternativeURL",
        "otherId",
        "author",
        "datasetContact",
        "dsDescription",
        "subject",
        "keyword",
        "topicClassification",
        "publication",
        "notesText",
        "producer",
        "productionDate",
        "productionPlace",
        "contributor",
        "grantNumber",
        "distributor",
        "distributionDate",
        "depositor",
        "dateOfDeposit",
        "timePeriodCovered",
        "dateOfCollection",
        "kindOfData",
        "language",
        "series",
        "software",
        "relatedMaterial",
        "relatedDatasets",
        "otherReferences",
        "dataSources",
        "originOfSources",
        "characteristicOfSources",
        "accessToSources",
        "geospatial_displayName",
        "geographicCoverage",
        "geographicUnit",
        "geographicBoundingBox",
        "socialscience_displayName",
        "unitOfAnalysis",
        "universe",
        "timeMethod",
        "dataCollector",
        "collectorTraining",
        "frequencyOfDataCollection",
        "samplingProcedure",
        "targetSampleSize",
        "deviationsFromSampleDesign",
        "collectionMode",
        "researchInstrument",
        "dataCollectionSituation",
        "actionsToMinimizeLoss",
        "controlOperations",
        "weighting",
        "cleaningOperations",
        "datasetLevelErrorNotes",
        "responseRate",
        "samplingErrorEstimates",
        "otherDataAppraisal",
        "socialScienceNotes",
        "journal_displayName",
        "journalVolumeIssue",
        "journalArticleType",
    ]
def validate_json(self, filename_schema=None):
    """Validate JSON formats of Dataset.

    Check if JSON data structure is valid: first against the JSON schema,
    then with Dataverse-specific checks that required attributes are
    present and that compound fields are complete.

    Parameters
    ----------
    filename_schema : str
        Filename of JSON schema with full path. Defaults to the schema
        named in ``self._default_json_schema_filename``.

    Returns
    -------
    bool
        `True` if JSON validates correctly, `False` if not.

    Examples
    --------
    Check if JSON is valid for Dataverse API upload::

        >>> from pyDataverse.models import Dataset
        >>> ds = Dataset()
        >>> data = {
        >>>     'title': 'pyDataverse study 2019',
        >>>     'dsDescription': [
        >>>         {'dsDescriptionValue': 'New study about pyDataverse usage in 2019'}
        >>>     ]
        >>> }
        >>> ds.set(data)
        >>> print(ds.validate_json())
        False
        >>> ds.author = [{'authorName': 'LastAuthor1, FirstAuthor1'}]
        >>> ds.datasetContact = [{'datasetContactName': 'LastContact1, FirstContact1'}]
        >>> ds.subject = ['Engineering']
        >>> print(ds.validate_json())
        True
    """
    if filename_schema is None:
        filename_schema = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            self._default_json_schema_filename,
        )
    assert isinstance(filename_schema, str)

    is_valid = True

    data_json = self.json(validate=False)
    if data_json:
        is_valid = validate_data(
            json.loads(data_json), filename_schema, file_format="json"
        )
        if not is_valid:
            return False
    else:
        return False

    # check if all required attributes are set
    for attr in self.__attr_dict_dv_up_required:
        if attr in self.__dict__:
            if not getattr(self, attr):
                is_valid = False
                print("Attribute '{0}' is `False`.".format(attr))
        else:
            is_valid = False
            print("Attribute '{0}' missing.".format(attr))

    # check if attributes set are complete where necessary:
    # compound fields must not have only one half of a paired sub-field.
    if "timePeriodCovered" in self.__dict__:
        tp_cov = getattr(self, "timePeriodCovered")
        if tp_cov:
            for tp in tp_cov:
                if "timePeriodCoveredStart" in tp or "timePeriodCoveredEnd" in tp:
                    if not (
                        "timePeriodCoveredStart" in tp
                        and "timePeriodCoveredEnd" in tp
                    ):
                        is_valid = False
                        print("timePeriodCovered attribute missing.")

    if "dateOfCollection" in self.__dict__:
        d_coll = getattr(self, "dateOfCollection")
        if d_coll:
            for d in d_coll:
                if "dateOfCollectionStart" in d or "dateOfCollectionEnd" in d:
                    if not (
                        "dateOfCollectionStart" in d and "dateOfCollectionEnd" in d
                    ):
                        is_valid = False
                        print("dateOfCollection attribute missing.")

    # sub-fields of compound fields require their "name" sub-field
    if "author" in self.__dict__:
        authors = getattr(self, "author")
        if authors:
            for a in authors:
                if (
                    "authorAffiliation" in a
                    or "authorIdentifierScheme" in a
                    or "authorIdentifier" in a
                ):
                    if "authorName" not in a:
                        is_valid = False
                        print("author attribute missing.")

    if "datasetContact" in self.__dict__:
        ds_contac = getattr(self, "datasetContact")
        if ds_contac:
            for c in ds_contac:
                if "datasetContactAffiliation" in c or "datasetContactEmail" in c:
                    if "datasetContactName" not in c:
                        is_valid = False
                        print("datasetContact attribute missing.")

    if "producer" in self.__dict__:
        producer = getattr(self, "producer")
        if producer:
            for p in producer:
                if (
                    "producerAffiliation" in p
                    or "producerAbbreviation" in p
                    or "producerURL" in p
                    or "producerLogoURL" in p
                ):
                    # BUGFIX: was `if not p["producerName"]:`, which raised
                    # KeyError when producerName was absent instead of
                    # reporting it. .get() covers missing and empty values.
                    if not p.get("producerName"):
                        is_valid = False
                        print("producer attribute missing.")

    if "contributor" in self.__dict__:
        contributor = getattr(self, "contributor")
        if contributor:
            for c in contributor:
                if "contributorType" in c:
                    if "contributorName" not in c:
                        is_valid = False
                        print("contributor attribute missing.")

    if "distributor" in self.__dict__:
        distributor = getattr(self, "distributor")
        if distributor:
            for d in distributor:
                if (
                    "distributorAffiliation" in d
                    or "distributorAbbreviation" in d
                    or "distributorURL" in d
                    or "distributorLogoURL" in d
                ):
                    if "distributorName" not in d:
                        is_valid = False
                        print("distributor attribute missing.")

    if "geographicBoundingBox" in self.__dict__:
        bbox = getattr(self, "geographicBoundingBox")
        if bbox:
            for b in bbox:
                if b:
                    # a bounding box needs all four edges
                    if not (
                        "westLongitude" in b
                        and "eastLongitude" in b
                        and "northLongitude" in b
                        and "southLongitude" in b
                    ):
                        is_valid = False
                        print("geographicBoundingBox attribute missing.")

    assert isinstance(is_valid, bool)
    return is_valid
def from_json(
    self, json_str, data_format=None, validate=True, filename_schema=None
):
    """Import Dataset metadata from JSON file.

    Parses in the metadata of a Dataset from different JSON formats.
    All accepted values are collected into one flat ``data`` dict and
    applied via :func:`set` — including the per-block ``displayName``s,
    which previously bypassed :func:`set` through direct attribute
    assignment.

    Parameters
    ----------
    json_str : str
        JSON string to be imported.
    data_format : str
        Data formats available for import. See `_allowed_json_formats`.
    validate : bool
        `True`, if imported JSON should be validated against a JSON
        schema file. `False`, if JSON string should be imported directly and
        not checked if valid.
    filename_schema : str
        Filename of JSON schema with full path.

    Examples
    --------
    Set Dataverse attributes via flat :class:`dict`::

        >>> from pyDataverse.models import Dataset
        >>> ds = Dataset()
        >>> ds.from_json('tests/data/dataset_upload_min_default.json')
        >>> ds.title
        'Darwin's Finches'
    """
    assert isinstance(json_str, str)
    json_dict = json.loads(json_str)
    assert isinstance(json_dict, dict)
    assert isinstance(validate, bool)
    if data_format is None:
        data_format = self._default_json_format
    assert isinstance(data_format, str)
    assert data_format in self._allowed_json_formats
    if filename_schema is None:
        filename_schema = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            self._default_json_schema_filename,
        )
    assert isinstance(filename_schema, str)

    data = {}
    if data_format == "dataverse_upload":
        if validate:
            validate_data(json_dict, filename_schema, file_format="json")
        # dataset
        # get first level metadata and parse it automatically
        for key, val in json_dict["datasetVersion"].items():
            if not key == "metadataBlocks":
                if key in self.__attr_import_dv_up_datasetVersion_values:
                    data[key] = val
                else:
                    print(
                        "Attribute {0} not valid for import (format={1}).".format(
                            key, data_format
                        )
                    )
        if "metadataBlocks" in json_dict["datasetVersion"]:
            # citation
            if "citation" in json_dict["datasetVersion"]["metadataBlocks"]:
                citation = json_dict["datasetVersion"]["metadataBlocks"]["citation"]
                if "displayName" in citation:
                    data["citation_displayName"] = citation["displayName"]
                for field in citation["fields"]:
                    if (
                        field["typeName"]
                        in self.__attr_import_dv_up_citation_fields_values
                    ):
                        data[field["typeName"]] = field["value"]
                    elif (
                        field["typeName"]
                        in self.__attr_import_dv_up_citation_fields_arrays
                    ):
                        data[field["typeName"]] = self.__parse_field_array(
                            field["value"],
                            self.__attr_import_dv_up_citation_fields_arrays[
                                field["typeName"]
                            ],
                        )
                    elif field["typeName"] == "series":
                        # "series" is a single compound dict, not an array
                        data["series"] = {}
                        if "seriesName" in field["value"]:
                            data["series"]["seriesName"] = field["value"][
                                "seriesName"
                            ]["value"]
                        if "seriesInformation" in field["value"]:
                            data["series"]["seriesInformation"] = field["value"][
                                "seriesInformation"
                            ]["value"]
                    else:
                        print(
                            "Attribute {0} not valid for import (dv_up).".format(
                                field["typeName"]
                            )
                        )
            else:
                # TODO: Exception
                pass
            # geospatial
            if "geospatial" in json_dict["datasetVersion"]["metadataBlocks"]:
                geospatial = json_dict["datasetVersion"]["metadataBlocks"][
                    "geospatial"
                ]
                if "displayName" in geospatial:
                    # CONSISTENCY FIX: route through `data`/set() like
                    # citation_displayName (was a direct __setattr__).
                    data["geospatial_displayName"] = geospatial["displayName"]
                for field in geospatial["fields"]:
                    if (
                        field["typeName"]
                        in self.__attr_import_dv_up_geospatial_fields_values
                    ):
                        data[field["typeName"]] = field["value"]
                    elif (
                        field["typeName"]
                        in self.__attr_import_dv_up_geospatial_fields_arrays
                    ):
                        data[field["typeName"]] = self.__parse_field_array(
                            field["value"],
                            self.__attr_import_dv_up_geospatial_fields_arrays[
                                field["typeName"]
                            ],
                        )
                    else:
                        print(
                            "Attribute {0} not valid for import (dv_up).".format(
                                field["typeName"]
                            )
                        )
            else:
                # TODO: Exception
                pass
            # socialscience
            if "socialscience" in json_dict["datasetVersion"]["metadataBlocks"]:
                socialscience = json_dict["datasetVersion"]["metadataBlocks"][
                    "socialscience"
                ]
                if "displayName" in socialscience:
                    # CONSISTENCY FIX: route through `data`/set().
                    data["socialscience_displayName"] = socialscience["displayName"]
                for field in socialscience["fields"]:
                    if (
                        field["typeName"]
                        in self.__attr_import_dv_up_socialscience_fields_values
                    ):
                        data[field["typeName"]] = field["value"]
                    elif field["typeName"] == "targetSampleSize":
                        # single compound dict
                        data["targetSampleSize"] = {}
                        if "targetSampleActualSize" in field["value"]:
                            data["targetSampleSize"][
                                "targetSampleActualSize"
                            ] = field["value"]["targetSampleActualSize"]["value"]
                        if "targetSampleSizeFormula" in field["value"]:
                            data["targetSampleSize"][
                                "targetSampleSizeFormula"
                            ] = field["value"]["targetSampleSizeFormula"]["value"]
                    elif field["typeName"] == "socialScienceNotes":
                        # single compound dict
                        data["socialScienceNotes"] = {}
                        if "socialScienceNotesType" in field["value"]:
                            data["socialScienceNotes"][
                                "socialScienceNotesType"
                            ] = field["value"]["socialScienceNotesType"]["value"]
                        if "socialScienceNotesSubject" in field["value"]:
                            data["socialScienceNotes"][
                                "socialScienceNotesSubject"
                            ] = field["value"]["socialScienceNotesSubject"]["value"]
                        if "socialScienceNotesText" in field["value"]:
                            data["socialScienceNotes"][
                                "socialScienceNotesText"
                            ] = field["value"]["socialScienceNotesText"]["value"]
                    else:
                        print(
                            "Attribute {0} not valid for import (dv_up).".format(
                                field["typeName"]
                            )
                        )
            else:
                # TODO: Exception
                pass
            # journal
            if "journal" in json_dict["datasetVersion"]["metadataBlocks"]:
                journal = json_dict["datasetVersion"]["metadataBlocks"]["journal"]
                if "displayName" in journal:
                    # CONSISTENCY FIX: route through `data`/set().
                    data["journal_displayName"] = journal["displayName"]
                for field in journal["fields"]:
                    if (
                        field["typeName"]
                        in self.__attr_import_dv_up_journal_fields_values
                    ):
                        data[field["typeName"]] = field["value"]
                    elif (
                        field["typeName"]
                        in self.__attr_import_dv_up_journal_fields_arrays
                    ):
                        data[field["typeName"]] = self.__parse_field_array(
                            field["value"],
                            self.__attr_import_dv_up_journal_fields_arrays[
                                field["typeName"]
                            ],
                        )
                    else:
                        print(
                            "Attribute {0} not valid for import (dv_up).".format(
                                field["typeName"]
                            )
                        )
            else:
                # TODO: Exception
                pass
    elif data_format == "dataverse_download":
        print("INFO: Not implemented yet.")
    elif data_format == "dspace":
        print("INFO: Not implemented yet.")
    elif data_format == "custom":
        print("INFO: Not implemented yet.")
    self.set(data)
def __parse_field_array(self, data, attr_list):
    """Parse arrays of Dataset upload format.

    Each entry of a compound Dataverse API field is a dict mapping
    sub-field names to ``{"value": ...}`` wrappers; this unwraps the
    values for all whitelisted sub-fields.

    Parameters
    ----------
    data : list
        List of dictionaries of a specific Dataverse API metadata field.
    attr_list : list
        List of attributes to be parsed.

    Returns
    -------
    list
        List of :class:`dict`s with parsed out key-value pairs.
    """
    assert isinstance(data, list)
    assert isinstance(attr_list, list)

    parsed = []
    for entry in data:
        record = {}
        for sub_key, sub_val in entry.items():
            if sub_key in attr_list:
                record[sub_key] = sub_val["value"]
            else:
                print("Key '{0}' not in attribute list".format(sub_key))
        parsed.append(record)

    assert isinstance(parsed, list)
    return parsed
def __generate_field_arrays(self, key, sub_keys):
    """Build the array-of-dicts structure for a Dataverse API upload field.

    Reads the attribute named *key* from the instance (a list of flat
    dicts) and wraps each sub-key listed in *sub_keys* into the
    ``{"typeName", "typeClass", "multiple", "value"}`` form the
    Dataverse upload API expects.

    Parameters
    ----------
    key : str
        Name of attribute.
    sub_keys : list
        List of keys to be created.

    Returns
    -------
    list
        List of filled :class:`dict`s of metadata for Dataverse API upload.

    """
    assert isinstance(key, str)
    assert isinstance(sub_keys, list)

    generated = []
    # Raises AttributeError if the attribute does not exist; an empty
    # or falsy attribute yields an empty result list.
    if self.__getattribute__(key):
        source = self.__getattribute__(key)
        for element in source:
            converted = {}
            for sub_key, sub_val in element.items():
                # Skip anything outside the whitelist.
                if sub_key not in sub_keys:
                    continue
                # Classify the field; unknown keys get typeClass None.
                if sub_key in self.__attr_dict_dv_up_type_class_primitive:
                    type_class = "primitive"
                elif sub_key in self.__attr_dict_dv_up_type_class_compound:
                    type_class = "compound"
                elif (
                    sub_key in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                ):
                    type_class = "controlledVocabulary"
                else:
                    type_class = None
                converted[sub_key] = {
                    "typeName": sub_key,
                    "typeClass": type_class,
                    "multiple": isinstance(sub_val, list),
                    "value": sub_val,
                }
            generated.append(converted)

    assert isinstance(generated, list)
    return generated
def json(self, data_format=None, validate=True, filename_schema=None):
    """Create Dataset JSON from attributes.

    Converts the flat attribute dict (see :func:`get`) into the nested
    JSON structure of the requested format and returns it as a string.

    Parameters
    ----------
    data_format : str
        Data format of the JSON to be created. Must be one of
        `_allowed_json_formats`; defaults to `_default_json_format`.
    validate : bool
        `True`, if created JSON should be validated against a JSON schema
        file. `False`, if JSON string should be created and not checked if
        valid.
    filename_schema : str
        Filename of JSON schema with full path. Defaults to the schema
        file shipped next to this module
        (`_default_json_schema_filename`).

    Returns
    -------
    str
        The data as a JSON string.
    """
    assert isinstance(validate, bool)
    if data_format is None:
        data_format = self._default_json_format
    assert isinstance(data_format, str)
    assert data_format in self._allowed_json_formats
    # Fall back to the schema file shipped alongside this module.
    if filename_schema is None:
        filename_schema = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            self._default_json_schema_filename,
        )
    assert isinstance(filename_schema, str)

    data = {}
    if data_format == "dataverse_upload":
        data_dict = self.get()
        data["datasetVersion"] = {}
        data["datasetVersion"]["metadataBlocks"] = {}
        citation = {}
        citation["fields"] = []

        # dataset
        # Generate first level attributes
        for attr in self.__attr_import_dv_up_datasetVersion_values:
            if attr in data_dict:
                data["datasetVersion"][attr] = data_dict[attr]

        # citation
        if "citation_displayName" in data_dict:
            citation["displayName"] = data_dict["citation_displayName"]

        # Generate first level attributes
        for attr in self.__attr_import_dv_up_citation_fields_values:
            if attr in data_dict:
                v = data_dict[attr]
                if isinstance(v, list):
                    multiple = True
                else:
                    multiple = False
                # NOTE(review): `type_class` is not reset per iteration; an
                # attribute missing from all three type-class dicts would
                # reuse the previous iteration's value (or raise NameError
                # on the first pass). The dicts presumably cover every
                # importable attribute -- confirm upstream.
                if attr in self.__attr_dict_dv_up_type_class_primitive:
                    type_class = "primitive"
                elif attr in self.__attr_dict_dv_up_type_class_compound:
                    type_class = "compound"
                elif (
                    attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                ):
                    type_class = "controlledVocabulary"
                citation["fields"].append(
                    {
                        "typeName": attr,
                        "multiple": multiple,
                        "typeClass": type_class,
                        "value": v,
                    }
                )

        # Generate fields attributes
        for (key, val,) in self.__attr_import_dv_up_citation_fields_arrays.items():
            if key in data_dict:
                v = data_dict[key]
                citation["fields"].append(
                    {
                        "typeName": key,
                        "multiple": True,
                        "typeClass": "compound",
                        "value": self.__generate_field_arrays(key, val),
                    }
                )

        # Generate series attributes
        # `series` is a compound field with two optional primitive children.
        if "series" in data_dict:
            series = data_dict["series"]
            tmp_dict = {}
            if "seriesName" in series:
                if series["seriesName"] is not None:
                    tmp_dict["seriesName"] = {}
                    tmp_dict["seriesName"]["typeName"] = "seriesName"
                    tmp_dict["seriesName"]["multiple"] = False
                    tmp_dict["seriesName"]["typeClass"] = "primitive"
                    tmp_dict["seriesName"]["value"] = series["seriesName"]
            if "seriesInformation" in series:
                if series["seriesInformation"] is not None:
                    tmp_dict["seriesInformation"] = {}
                    tmp_dict["seriesInformation"]["typeName"] = "seriesInformation"
                    tmp_dict["seriesInformation"]["multiple"] = False
                    tmp_dict["seriesInformation"]["typeClass"] = "primitive"
                    tmp_dict["seriesInformation"]["value"] = series[
                        "seriesInformation"
                    ]
            citation["fields"].append(
                {
                    "typeName": "series",
                    "multiple": False,
                    "typeClass": "compound",
                    "value": tmp_dict,
                }
            )

        # geospatial
        # Bind `geospatial` only if any geospatial attribute is present;
        # its existence is tested later via `locals()`.
        for attr in (
            self.__attr_import_dv_up_geospatial_fields_values
            + list(self.__attr_import_dv_up_geospatial_fields_arrays.keys())
            + ["geospatial_displayName"]
        ):
            if attr in data_dict:
                geospatial = {}
                if attr != "geospatial_displayName":
                    geospatial["fields"] = []
                break
        if "geospatial_displayName" in data_dict:
            geospatial["displayName"] = data_dict["geospatial_displayName"]

        # Generate first level attributes
        for attr in self.__attr_import_dv_up_geospatial_fields_values:
            if attr in data_dict:
                v = data_dict[attr]
                if isinstance(v, list):
                    multiple = True
                else:
                    multiple = False
                if attr in self.__attr_dict_dv_up_type_class_primitive:
                    type_class = "primitive"
                elif attr in self.__attr_dict_dv_up_type_class_compound:
                    type_class = "compound"
                elif (
                    attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                ):
                    type_class = "controlledVocabulary"
                geospatial["fields"].append(
                    {
                        "typeName": attr,
                        "multiple": multiple,
                        "typeClass": type_class,
                        "value": v,
                    }
                )

        # Generate fields attributes
        for (
            key,
            val,
        ) in self.__attr_import_dv_up_geospatial_fields_arrays.items():
            if key in data_dict:
                geospatial["fields"].append(
                    {
                        "typeName": key,
                        "multiple": True,
                        "typeClass": "compound",
                        "value": self.__generate_field_arrays(key, val),
                    }
                )

        # socialscience
        # Same deferred-binding pattern as `geospatial` above.
        for attr in self.__attr_import_dv_up_socialscience_fields_values + [
            "socialscience_displayName"
        ]:
            if attr in data_dict:
                socialscience = {}
                if attr != "socialscience_displayName":
                    socialscience["fields"] = []
                break
        if "socialscience_displayName" in data_dict:
            socialscience["displayName"] = data_dict["socialscience_displayName"]

        # Generate first level attributes
        for attr in self.__attr_import_dv_up_socialscience_fields_values:
            if attr in data_dict:
                v = data_dict[attr]
                if isinstance(v, list):
                    multiple = True
                else:
                    multiple = False
                if attr in self.__attr_dict_dv_up_type_class_primitive:
                    type_class = "primitive"
                elif attr in self.__attr_dict_dv_up_type_class_compound:
                    type_class = "compound"
                elif (
                    attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                ):
                    type_class = "controlledVocabulary"
                socialscience["fields"].append(
                    {
                        "typeName": attr,
                        "multiple": multiple,
                        "typeClass": type_class,
                        "value": v,
                    }
                )

        # Generate targetSampleSize attributes
        # Compound field assembled by hand (not via __generate_field_arrays).
        if "targetSampleSize" in data_dict:
            target_sample_size = data_dict["targetSampleSize"]
            tmp_dict = {}
            if "targetSampleActualSize" in target_sample_size:
                if target_sample_size["targetSampleActualSize"] is not None:
                    tmp_dict["targetSampleActualSize"] = {}
                    tmp_dict["targetSampleActualSize"][
                        "typeName"
                    ] = "targetSampleActualSize"
                    tmp_dict["targetSampleActualSize"]["multiple"] = False
                    tmp_dict["targetSampleActualSize"]["typeClass"] = "primitive"
                    tmp_dict["targetSampleActualSize"][
                        "value"
                    ] = target_sample_size["targetSampleActualSize"]
            if "targetSampleSizeFormula" in target_sample_size:
                if target_sample_size["targetSampleSizeFormula"] is not None:
                    tmp_dict["targetSampleSizeFormula"] = {}
                    tmp_dict["targetSampleSizeFormula"][
                        "typeName"
                    ] = "targetSampleSizeFormula"
                    tmp_dict["targetSampleSizeFormula"]["multiple"] = False
                    tmp_dict["targetSampleSizeFormula"]["typeClass"] = "primitive"
                    tmp_dict["targetSampleSizeFormula"][
                        "value"
                    ] = target_sample_size["targetSampleSizeFormula"]
            socialscience["fields"].append(
                {
                    "typeName": "targetSampleSize",
                    "multiple": False,
                    "typeClass": "compound",
                    "value": tmp_dict,
                }
            )

        # Generate socialScienceNotes attributes
        # Compound field with three optional primitive children.
        if "socialScienceNotes" in data_dict:
            social_science_notes = data_dict["socialScienceNotes"]
            tmp_dict = {}
            if "socialScienceNotesType" in social_science_notes:
                if social_science_notes["socialScienceNotesType"] is not None:
                    tmp_dict["socialScienceNotesType"] = {}
                    tmp_dict["socialScienceNotesType"][
                        "typeName"
                    ] = "socialScienceNotesType"
                    tmp_dict["socialScienceNotesType"]["multiple"] = False
                    tmp_dict["socialScienceNotesType"]["typeClass"] = "primitive"
                    tmp_dict["socialScienceNotesType"][
                        "value"
                    ] = social_science_notes["socialScienceNotesType"]
            if "socialScienceNotesSubject" in social_science_notes:
                if social_science_notes["socialScienceNotesSubject"] is not None:
                    tmp_dict["socialScienceNotesSubject"] = {}
                    tmp_dict["socialScienceNotesSubject"][
                        "typeName"
                    ] = "socialScienceNotesSubject"
                    tmp_dict["socialScienceNotesSubject"]["multiple"] = False
                    tmp_dict["socialScienceNotesSubject"]["typeClass"] = "primitive"
                    tmp_dict["socialScienceNotesSubject"][
                        "value"
                    ] = social_science_notes["socialScienceNotesSubject"]
            if "socialScienceNotesText" in social_science_notes:
                if social_science_notes["socialScienceNotesText"] is not None:
                    tmp_dict["socialScienceNotesText"] = {}
                    tmp_dict["socialScienceNotesText"][
                        "typeName"
                    ] = "socialScienceNotesText"
                    tmp_dict["socialScienceNotesText"]["multiple"] = False
                    tmp_dict["socialScienceNotesText"]["typeClass"] = "primitive"
                    tmp_dict["socialScienceNotesText"][
                        "value"
                    ] = social_science_notes["socialScienceNotesText"]
            socialscience["fields"].append(
                {
                    "typeName": "socialScienceNotes",
                    "multiple": False,
                    "typeClass": "compound",
                    "value": tmp_dict,
                }
            )

        # journal
        # Same deferred-binding pattern as `geospatial` above.
        for attr in (
            self.__attr_import_dv_up_journal_fields_values
            + list(self.__attr_import_dv_up_journal_fields_arrays.keys())
            + ["journal_displayName"]
        ):
            if attr in data_dict:
                journal = {}
                if attr != "journal_displayName":
                    journal["fields"] = []
                break
        if "journal_displayName" in data_dict:
            journal["displayName"] = data_dict["journal_displayName"]

        # Generate first level attributes
        for attr in self.__attr_import_dv_up_journal_fields_values:
            if attr in data_dict:
                v = data_dict[attr]
                if isinstance(v, list):
                    multiple = True
                else:
                    multiple = False
                if attr in self.__attr_dict_dv_up_type_class_primitive:
                    type_class = "primitive"
                elif attr in self.__attr_dict_dv_up_type_class_compound:
                    type_class = "compound"
                elif (
                    attr in self.__attr_dict_dv_up_type_class_controlled_vocabulary
                ):
                    type_class = "controlledVocabulary"
                journal["fields"].append(
                    {
                        "typeName": attr,
                        "multiple": multiple,
                        "typeClass": type_class,
                        "value": v,
                    }
                )

        # Generate fields attributes
        for (key, val,) in self.__attr_import_dv_up_journal_fields_arrays.items():
            if key in data_dict:
                journal["fields"].append(
                    {
                        "typeName": key,
                        "multiple": True,
                        "typeClass": "compound",
                        "value": self.__generate_field_arrays(key, val),
                    }
                )

        data["datasetVersion"]["metadataBlocks"]["citation"] = citation
        # The optional metadata blocks were only bound if a matching
        # attribute was found above, hence the `locals()` membership tests.
        if "socialscience" in locals():
            data["datasetVersion"]["metadataBlocks"][
                "socialscience"
            ] = socialscience
        if "geospatial" in locals():
            data["datasetVersion"]["metadataBlocks"]["geospatial"] = geospatial
        if "journal" in locals():
            data["datasetVersion"]["metadataBlocks"]["journal"] = journal
    elif data_format == "dspace":
        data = None
        print("INFO: Not implemented yet.")
    elif data_format == "custom":
        data = None
        print("INFO: Not implemented yet.")
    if validate:
        validate_data(data, filename_schema)
    json_str = json.dumps(data, indent=2)
    assert isinstance(json_str, str)
    return json_str
class Datafile(DVObject):
    """Base class for the Dataverse data type `Datafile`.

    Attributes
    ----------
    _default_json_format : str
        Default JSON data format.
    _default_json_schema_filename : str
        Default JSON schema filename.
    _allowed_json_formats : list
        List of all possible JSON data formats.
    _json_dataverse_upload_attr : list
        List of all attributes to be exported in :func:`json`.
    """

    def __init__(self, data=None):
        """Init :class:`Datafile()`.

        Inherits attributes from parent :class:`DVObject()`

        Parameters
        ----------
        data : dict
            Flat dictionary. All keys will be mapped to a similar
            named attribute and it's value.

        Examples
        --------
        Create a Datafile::

            >>> from pyDataverse.models import Datafile
            >>> df = Datafile()
            >>> df._default_json_schema_filename
            'schemas/json/datafile_upload_schema.json'
        """
        # Class-prefixed copies of the internal attribute names, used by
        # DVObject.set() to refuse importing them from user data.
        # NOTE(review): the attributes actually assigned below use the
        # plain single-underscore names (e.g. `_default_json_format`), so
        # the "_Datafile"-prefixed entries never match incoming keys --
        # confirm whether this guard is intended to be effective.
        self._internal_attributes = ["_Datafile" + attr for attr in INTERNAL_ATTRIBUTES]

        # Apply user data first; the defaults below are set afterwards.
        super().__init__(data=data)

        self._default_json_format = "dataverse_upload"
        self._default_json_schema_filename = "schemas/json/datafile_upload_schema.json"
        self._allowed_json_formats = ["dataverse_upload", "dataverse_download"]
        # Attributes accepted by the Dataverse native API for datafile upload.
        self._json_dataverse_upload_attr = [
            "description",
            "categories",
            "restrict",
            "label",
            "directoryLabel",
            "pid",
            "filename",
        ]