# Source code for pyDataverse.utils

"""Helper functions."""
import csv
import json
import os
import pickle

from jsonschema import validate


# Dataset metadata column names whose CSV cells hold JSON-encoded compound or
# multi-valued content. read_csv_as_dicts() uses this list as its default
# `json_cols` argument and runs json.loads() on these cells during import.
CSV_JSON_COLS = [
    "otherId",
    "series",
    "author",
    "dsDescription",
    "subject",
    "keyword",
    "topicClassification",
    "language",
    "grantNumber",
    "dateOfCollection",
    "kindOfData",
    "dataSources",
    "otherReferences",
    "contributor",
    "relatedDatasets",
    "relatedMaterial",
    "datasetContact",
    "distributor",
    "producer",
    "publication",
    "software",
    "timePeriodCovered",
    "geographicUnit",
    "geographicBoundingBox",
    "geographicCoverage",
    "socialScienceNotes",
    "unitOfAnalysis",
    "universe",
    "targetSampleActualSize",
    "categories",
]


def read_file(filename, mode="r", encoding="utf-8"):
    """Return the entire contents of a text file as a string.

    Parameters
    ----------
    filename : str
        Filename with full path.
    mode : str
        Read mode of file. Defaults to `r`. See more at
        https://docs.python.org/3.5/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    Returns
    -------
    str
        Returns data as string.
    """
    assert isinstance(filename, str)
    assert isinstance(mode, str)
    assert isinstance(encoding, str)

    with open(filename, mode, encoding=encoding) as handle:
        content = handle.read()

    assert isinstance(content, str)
    return content
def write_file(filename, data, mode="w", encoding="utf-8"):
    """Store a string in a file.

    Parameters
    ----------
    filename : str
        Filename with full path.
    data : str
        Data to be stored.
    mode : str
        Read mode of file. Defaults to `w`. See more at
        https://docs.python.org/3.5/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.
    """
    for arg in (filename, data, mode, encoding):
        assert isinstance(arg, str)

    with open(filename, mode, encoding=encoding) as handle:
        handle.write(data)
def read_json(filename: str, mode: str = "r", encoding: str = "utf-8") -> dict:
    """Load and parse a JSON file.

    See more about the json module at
    https://docs.python.org/3.5/library/json.html

    Parameters
    ----------
    filename : str
        Filename with full path.
    mode : str
        Read mode of file. Defaults to `r`. See more at
        https://docs.python.org/3.5/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    Returns
    -------
    dict
        Parsed JSON content.
    """
    with open(filename, mode, encoding=encoding) as handle:
        return json.load(handle)
def write_json(filename, data, mode="w", encoding="utf-8"):
    """Serialize data to a JSON file (indented with two spaces).

    Parameters
    ----------
    filename : str
        Filename with full path.
    data : dict
        Data to be written in the JSON file.
    mode : str
        Write mode of file. Defaults to `w`. See more at
        https://docs.python.org/3/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.
    """
    with open(filename, mode, encoding=encoding) as handle:
        json.dump(data, handle, indent=2)
def read_pickle(filename):
    """Load a dict from a pickle file.

    See more at `pickle <https://docs.python.org/3/library/pickle.html>`_.

    Parameters
    ----------
    filename : str
        Full filename with path of file.

    Returns
    -------
    dict
        Data object.
    """
    assert isinstance(filename, str)

    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # trusted files.
    with open(filename, "rb") as handle:
        obj = pickle.load(handle)

    assert isinstance(obj, dict)
    return obj
def write_pickle(filename, data):
    """Serialize a dict into a pickle file.

    See more at `pickle <https://docs.python.org/3/library/pickle.html>`_.

    Parameters
    ----------
    filename : str
        Full filename with path of file.
    data : dict
        Data to write in pickle file.
    """
    assert isinstance(filename, str)
    assert isinstance(data, dict)

    with open(filename, "wb") as handle:
        pickle.dump(data, handle)
def read_csv(filename, newline="", delimiter=",", quotechar='"', encoding="utf-8"):
    """Read in a CSV file and return its rows.

    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    Parameters
    ----------
    filename : str
        Full filename with path of file.
    newline : str
        Newline character.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    Returns
    -------
    list
        List of rows (each row a list of cell strings), which can be
        iterated over.
    """
    assert isinstance(filename, str)
    assert isinstance(newline, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)
    assert isinstance(encoding, str)

    with open(filename, newline=newline, encoding=encoding) as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
        # Materialize all rows while the file is still open. The previous
        # version returned the raw reader after the `with` block closed the
        # file (unusable) and asserted isinstance against csv.reader, which
        # is a builtin function, not a type, so it raised TypeError.
        rows = list(csv_reader)

    return rows
def write_csv(
    data, filename, newline="", delimiter=",", quotechar='"', encoding="utf-8"
):
    """Write rows of data to a CSV file.

    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    Parameters
    ----------
    data : list
        List of rows; each row is passed directly to ``csv.writer.writerow()``,
        so it should be a sequence of cell values (a dict row would write only
        its keys).
    filename : str
        Full filename with path of file.
    newline : str
        Newline character.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.
    """
    assert isinstance(data, list)
    assert isinstance(filename, str)
    assert isinstance(newline, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)
    assert isinstance(encoding, str)

    # File is opened for (over)writing; `newline` is forwarded to open() as
    # recommended by the csv module docs.
    with open(filename, "w", newline=newline, encoding=encoding) as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter, quotechar=quotechar)
        for row in data:
            writer.writerow(row)
def read_csv_as_dicts(
    filename,
    newline="",
    delimiter=",",
    quotechar='"',
    encoding="utf-8",
    remove_prefix=True,
    prefix="dv.",
    json_cols=CSV_JSON_COLS,
    false_values=("FALSE",),
    true_values=("TRUE",),
):
    """Read in CSV file into a list of :class:`dict`.

    This offers an easy import functionality of your data from CSV files.
    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    CSV file structure:
    1) The header row contains the column names.
    2) A row contains one dataset.
    3) A column contains one specific attribute.

    Recommendation: Name the column name the way you want the attribute to be
    named later in your Dataverse object. See the
    `pyDataverse templates <https://github.com/GDCC/pyDataverse_templates>`_
    for this. The created :class:`dict` can later be used for the `set()`
    function to create Dataverse objects.

    Parameters
    ----------
    filename : str
        Filename with full path.
    newline : str
        Newline character.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.
    remove_prefix : bool
        If ``True``, strip ``prefix`` from the start of column names.
    prefix : str
        Column-name prefix to strip. Defaults to 'dv.'.
    json_cols : list
        Column names whose cell values are JSON strings to be parsed with
        ``json.loads()``. Defaults to ``CSV_JSON_COLS``.
    false_values : sequence
        Cell values converted to ``False``.
    true_values : sequence
        Cell values converted to ``True``.

    Returns
    -------
    list
        List with one :class:`dict` per row. The keys of a :class:`dict` are
        named after the column names (with ``prefix`` removed, if requested).
    """
    assert isinstance(filename, str)
    assert isinstance(newline, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)
    assert isinstance(encoding, str)

    with open(filename, "r", newline=newline, encoding=encoding) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter, quotechar=quotechar)
        data = [dict(row) for row in reader]

    result = []
    for row in data:
        converted = {}
        for key, val in row.items():
            # Strip the configured prefix first, so json_cols matches the
            # final attribute names.
            if remove_prefix and key.startswith(prefix):
                key = key[len(prefix):]
            # Boolean markers take precedence over JSON parsing. Fixed: the
            # previous version overwrote `False` with `True` immediately, so
            # FALSE cells were imported as True.
            if val in false_values:
                converted[key] = False
            elif val in true_values:
                converted[key] = True
            elif key in json_cols:
                converted[key] = json.loads(val)
            else:
                converted[key] = val
        result.append(converted)
    return result
def write_dicts_as_csv(data, fieldnames, filename, delimiter=",", quotechar='"'):
    """Write a list of :class:`dict` rows to a CSV file.

    This offers an easy export functionality of your data to a CSV file.
    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    Parameters
    ----------
    data : list
        List of :class:`dict`; keys are column names, values cell content.
        ``dict`` and ``list`` values are serialized with ``json.dumps()``.
    fieldnames : list
        Sequence of keys that identify the order of the columns.
    filename : str
        Filename with full path.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    """
    # Fixed: `data` is a list of dicts, not a str (the old assert made every
    # call fail).
    assert isinstance(data, list)
    assert isinstance(fieldnames, list)
    assert isinstance(filename, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)

    with open(filename, "w", newline="") as csvfile:
        # Fixed: delimiter/quotechar were accepted but never forwarded.
        writer = csv.DictWriter(
            csvfile, fieldnames=fieldnames, delimiter=delimiter, quotechar=quotechar
        )
        writer.writeheader()
        for d in data:
            # Work on a copy so the caller's dicts are not mutated.
            # Fixed: json.dump() requires a file object; json.dumps() returns
            # the serialized string.
            row = {
                key: json.dumps(val) if isinstance(val, (dict, list)) else val
                for key, val in d.items()
            }
            writer.writerow(row)
def clean_string(string):
    """Clean a string.

    Trims surrounding whitespace and collapses runs of spaces into one.

    Parameters
    ----------
    string : str
        String to be cleaned.

    Returns
    -------
    str
        Cleaned string.
    """
    assert isinstance(string, str)

    clean_str = string.strip()
    # Fixed: the previous replace(" ", " ") was a no-op; collapse repeated
    # spaces until none remain.
    while "  " in clean_str:
        clean_str = clean_str.replace("  ", " ")

    assert isinstance(clean_str, str)
    return clean_str
def validate_data(data: dict, filename_schema: str, file_format: str = "json") -> bool:
    """Validate data against a schema.

    Parameters
    ----------
    data : dict
        Data to be validated.
    filename_schema : str
        Filename with full path of the schema file.
    file_format : str
        File format to be validated ('json' is the only implemented format).

    Returns
    -------
    bool
        `True` if data was validated, `False` if not.
    """
    assert isinstance(data, dict)
    assert isinstance(filename_schema, str)
    assert isinstance(file_format, str)

    if file_format == "json":
        # jsonschema.validate raises on invalid data; reaching the return
        # means validation succeeded.
        validate(instance=data, schema=read_json(filename_schema))
        return True
    if file_format == "xml":
        print("INFO: Not implemented yet.")
        return False
    print("WARNING: No valid format passed.")
    return False
def create_dataverse_url(base_url, identifier):
    """Creates URL of Dataverse.

    Example: https://data.aussda.at/dataverse/autnes

    Parameters
    ----------
    base_url : str
        Base URL of Dataverse instance
    identifier : str
        Can either be a dataverse id (long), a dataverse alias (more
        robust), or the special value ``:root``.

    Returns
    -------
    str
        URL of the dataverse
    """
    assert isinstance(base_url, str)
    assert isinstance(identifier, str)

    # Drop any trailing slash so the joined URL never contains "//".
    url = "{0}/dataverse/{1}".format(base_url.rstrip("/"), identifier)

    assert isinstance(url, str)
    return url
def create_dataset_url(base_url, identifier, is_pid):
    """Creates URL of Dataset.

    Example: https://data.aussda.at/dataset.xhtml?persistentId=doi:10.11587/CCESLK

    Parameters
    ----------
    base_url : str
        Base URL of Dataverse instance
    identifier : str
        Identifier of the dataset. Can be dataset id or persistent
        identifier of the dataset (e. g. doi).
    is_pid : bool
        ``True`` to use persistent identifier. ``False``, if not.

    Returns
    -------
    str
        URL of the dataset
    """
    assert isinstance(base_url, str)
    assert isinstance(identifier, str)
    assert isinstance(is_pid, bool)

    base_url = base_url.rstrip("/")
    if is_pid:
        url = "{0}/dataset.xhtml?persistentId={1}".format(base_url, identifier)
    else:
        # Fixed: the query string was "?id{1}" (missing "="), producing a
        # malformed URL for database-id lookups.
        url = "{0}/dataset.xhtml?id={1}".format(base_url, identifier)

    assert isinstance(url, str)
    return url
def create_datafile_url(base_url, identifier, is_filepid):
    """Creates URL of Datafile.

    Example - File ID: https://data.aussda.at/file.xhtml?persistentId=doi:10.11587/CCESLK/5RH5GK

    Parameters
    ----------
    base_url : str
        Base URL of Dataverse instance
    identifier : str
        Identifier of the datafile. Can be datafile id or persistent
        identifier of the datafile (e. g. doi).
    is_filepid : bool
        ``True`` to use persistent identifier. ``False``, if not.

    Returns
    -------
    str
        URL of the datafile
    """
    assert isinstance(base_url, str)
    assert isinstance(identifier, str)

    trimmed = base_url.rstrip("/")
    if is_filepid:
        url = "{0}/file.xhtml?persistentId={1}".format(trimmed, identifier)
    else:
        url = "{0}/file.xhtml?fileId={1}".format(trimmed, identifier)

    assert isinstance(url, str)
    return url
def dataverse_tree_walker(
    data: list,
    dv_keys: list = ["dataverse_id", "dataverse_alias"],
    ds_keys: list = ["dataset_id", "pid"],
    df_keys: list = ["datafile_id", "filename", "pid", "label"],
) -> tuple:
    """Walk through a Dataverse tree by get_children().

    Recursively walk through the tree structure returned by
    ``get_children()`` and extract the keys needed.

    Parameters
    ----------
    data : dict
        Tree data structure returned by ``get_children()``.
    dv_keys : list
        List of keys to be extracted from each Dataverse element.
    ds_keys : list
        List of keys to be extracted from each Dataset element.
    df_keys : list
        List of keys to be extracted from each Datafile element.

    Returns
    -------
    tuple
        (List of Dataverses, List of Datasets, List of Datafiles)
    """
    dataverses = []
    datasets = []
    datafiles = []

    def _merge(result):
        dv, ds, df = result
        dataverses.extend(dv)
        datasets.extend(ds)
        datafiles.extend(df)

    if isinstance(data, list):
        for elem in data:
            # Fixed: recursive calls now forward the caller's key lists;
            # previously custom dv/ds/df keys only applied at the top level.
            _merge(dataverse_tree_walker(elem, dv_keys, ds_keys, df_keys))
    elif isinstance(data, dict):
        # Keep only the requested keys that are actually present.
        extract = lambda keys: {key: data[key] for key in keys if key in data}
        if data["type"] == "dataverse":
            dataverses.append(extract(dv_keys))
        elif data["type"] == "dataset":
            datasets.append(extract(ds_keys))
        elif data["type"] == "datafile":
            datafiles.append(extract(df_keys))
        if data.get("children"):
            _merge(dataverse_tree_walker(data["children"], dv_keys, ds_keys, df_keys))

    return dataverses, datasets, datafiles
def save_tree_data(
    dataverses: list,
    datasets: list,
    datafiles: list,
    filename_dv: str = "dataverses.json",
    filename_ds: str = "datasets.json",
    filename_df: str = "datafiles.json",
    filename_md: str = "metadata.json",
) -> None:
    """Save lists returned by ``dataverse_tree_walker`` as JSON files.

    Writes Dataverses, Datasets and Datafiles into separate JSON files,
    plus a metadata file with the element counts. Pre-existing output
    files (except the metadata file) are removed first.

    Parameters
    ----------
    dataverses : list
        List of Dataverse dicts.
    datasets : list
        List of Dataset dicts.
    datafiles : list
        List of Datafile dicts.
    filename_dv : str
        Filename with full path for the Dataverse JSON file.
    filename_ds : str
        Filename with full path for the Dataset JSON file.
    filename_df : str
        Filename with full path for the Datafile JSON file.
    filename_md : str
        Filename with full path for the metadata JSON file.
    """
    # Remove stale output so an empty category leaves no old file behind.
    for path in (filename_dv, filename_ds, filename_df):
        if os.path.isfile(path):
            os.remove(path)

    if dataverses:
        write_json(filename_dv, dataverses)
    if datasets:
        write_json(filename_ds, datasets)
    if datafiles:
        write_json(filename_df, datafiles)

    write_json(
        filename_md,
        {
            "dataverses": len(dataverses),
            "datasets": len(datasets),
            "datafiles": len(datafiles),
        },
    )
    print(f"- Dataverses: {len(dataverses)}")
    print(f"- Datasets: {len(datasets)}")
    print(f"- Datafiles: {len(datafiles)}")