Source code for opendataformat.write_odf

# -*- coding: utf-8 -*-
"""
Created on Thu Oct 17 12:25:16 2024

@author: thartl
"""

import pandas as pd
import os
import tempfile
import xml.etree.ElementTree as ET
import shutil
import zipfile
import xml.dom.minidom

"""

"""

def _select_metadata(attrs, base, languages):
    """Return the ``base`` / ``base_<lang>`` entries of *attrs*, filtered by language."""
    selected = {
        key: value
        for key, value in attrs.items()
        if key == base or key.startswith(base + "_")
    }
    # A list means "only these languages"; the string "all" disables filtering.
    if isinstance(languages, list):
        selected = {
            key: value
            for key, value in selected.items()
            if key.split("_")[-1] in languages
        }
    return selected


def _lang_attrib(key):
    """Return the ``xml:lang`` attribute dict for a metadata key (empty if no language suffix)."""
    parts = key.split("_")
    if len(parts) == 1:
        return {}
    return {"xml:lang": parts[1]}


def write_odf(x, path, languages="all"):
    """
    Write a pandas DataFrame to an Open Data Format (ODF) file.

    The data is stored as ``data.csv`` and the metadata found in the
    ``attrs`` of the DataFrame and of its columns is stored as
    ``metadata.xml`` (DDI Codebook 2.5); both files are packed into a zip
    archive. Metadata can optionally be filtered by language.

    Parameters
    ----------
    x : pandas.DataFrame
        The DataFrame to save. Dataset-level metadata is read from
        ``x.attrs`` (keys ``study``, ``dataset``, ``url``, ``label[_<lang>]``,
        ``description[_<lang>]``). Variable-level metadata is read from
        ``x[col].attrs`` (keys ``label[_<lang>]``, ``description[_<lang>]``,
        ``labels[_<lang>]``, ``type``, ``url``).
    path : str
        Output file path. ``.zip`` is appended if the path does not already
        end with it.
    languages : str or list of str, default "all"
        Which language(s) of metadata to include:

        - ``"all"``: include every language.
        - A single language code (e.g. ``"en"``).
        - A list of language codes (e.g. ``["en", "de"]``).

        If the list contains ``None`` or ``""``, metadata entries without a
        language suffix are included as well. The list passed by the caller
        is never modified.

    Returns
    -------
    None
        The archive is written to ``path``.

    Raises
    ------
    TypeError
        If `x` is not a pandas DataFrame.

    Notes
    -----
    - Columns whose ``attrs`` are equal to ``x.attrs`` are treated as having
      no variable-level metadata; only an empty ``<var>`` element is written
      for them.
    - Value-label keys and column names are converted with ``str`` before
      being written to XML.
    - If a value has no label in one of the selected languages, that
      language is skipped for this value.
    - Intermediate files are written to a private temporary directory that
      is removed before the function returns.

    Examples
    --------
    Write a DataFrame to an ODF file, including all metadata:

    >>> import opendataformat as odf
    >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    >>> df.attrs = {"label_en": "English Label",
    ...             "label_de": "German Label",
    ...             "description_en": "Example dataset"}
    >>> odf.write_odf(df, "output.zip")

    Write a DataFrame to an ODF file, filtering metadata by language:

    >>> odf.write_odf(df, "output.zip", languages="en")

    Write a DataFrame to an ODF file, including metadata for multiple
    languages:

    >>> odf.write_odf(df, "output.zip", languages=["en", "de"])
    """
    if not isinstance(x, pd.DataFrame):
        raise TypeError("Input not a pandas.core.frame.DataFrame")

    path = os.path.realpath(path)
    if not path.endswith(".zip"):
        path = path + ".zip"

    # Normalise `languages`: a scalar becomes a one-element list and
    # ["all"] collapses back to the string "all".
    if languages != "all" and not isinstance(languages, list):
        languages = [languages]
    if isinstance(languages, list) and len(languages) == 1 and languages[0] == "all":
        languages = "all"
    # None / "" request the entries without a language suffix; those are
    # matched by their bare key name. Build a new list so the caller's list
    # is left untouched.
    if isinstance(languages, list) and (None in languages or "" in languages):
        languages = languages + ["label", "labels", "description"]

    # Create the root element
    root = ET.Element("codeBook", {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xmlns": "ddi:codebook:2_5",
        "xsi:schemaLocation": "ddi:codebook:2_5 http://www.ddialliance.org/Specification/DDI-Codebook/2.5/XMLSchema/codebook.xsd",
        "version": "2.5",
    })

    # Study description section
    stdy_dscr = ET.SubElement(root, "stdyDscr")
    citation = ET.SubElement(stdy_dscr, "citation")
    titl_stmt = ET.SubElement(citation, "titlStmt")
    ET.SubElement(titl_stmt, "titl").text = x.attrs.get('study', "")

    # File description section
    file_dscr = ET.SubElement(root, "fileDscr")
    file_txt = ET.SubElement(file_dscr, "fileTxt")
    ET.SubElement(file_txt, "fileName").text = x.attrs.get('dataset', "")
    file_citation = ET.SubElement(file_txt, "fileCitation")
    titl_stmt_file = ET.SubElement(file_citation, "titlStmt")

    # Dataset labels: the first one is the title, the rest are parallel titles.
    dataset_labels = _select_metadata(x.attrs, "label", languages)
    for position, (key, value) in enumerate(dataset_labels.items()):
        tag = "titl" if position == 0 else "parTitl"
        ET.SubElement(titl_stmt_file, tag, _lang_attrib(key)).text = value

    # Dataset descriptions
    dataset_descriptions = _select_metadata(x.attrs, "description", languages)
    for key, value in dataset_descriptions.items():
        ET.SubElement(file_txt, "fileCont", _lang_attrib(key)).text = value

    # External link to the dataset
    notes = ET.SubElement(file_dscr, "notes")
    ET.SubElement(notes, "ExtLink", {"URI": x.attrs.get('url', "")})

    # Data description with variable metadata
    data_dscr = ET.SubElement(root, "dataDscr")
    for col in x.columns:
        # Attribute values must be strings for ElementTree to serialise them.
        var = ET.SubElement(data_dscr, "var", {"name": str(col)})
        col_attrs = x[col].attrs
        # Column attrs identical to the dataset attrs are treated as
        # "no variable-level metadata".
        if col_attrs == x.attrs:
            continue

        # Variable labels
        for key, value in _select_metadata(col_attrs, "label", languages).items():
            ET.SubElement(var, "labl", _lang_attrib(key)).text = value

        # Variable descriptions
        for key, value in _select_metadata(col_attrs, "description", languages).items():
            ET.SubElement(var, "txt", _lang_attrib(key)).text = value

        # Value labels: one <catgry> per labelled value, in first-seen order
        # across all selected languages.
        value_labels = _select_metadata(col_attrs, "labels", languages)
        labelled_values = list(dict.fromkeys(
            code for mapping in value_labels.values() for code in mapping
        ))
        for code in labelled_values:
            catgry = ET.SubElement(var, "catgry")
            # Values may be non-string (e.g. integer codes); element text
            # must be a string to be serialisable.
            ET.SubElement(catgry, "catValu").text = str(code)
            for key, mapping in value_labels.items():
                # A language may lack a label for this value; skip it
                # instead of failing with KeyError.
                if code not in mapping:
                    continue
                ET.SubElement(catgry, "labl", _lang_attrib(key)).text = mapping[code]

        # Variable format
        ET.SubElement(var, "varFormat", {"type": col_attrs.get('type', "")})

        # External link to the variable
        var_notes = ET.SubElement(var, "notes")
        ET.SubElement(var_notes, "ExtLink", {"URI": col_attrs.get('url', "")})

    # Serialise and pretty-print (indentation and line breaks) via minidom.
    xml_bytes = ET.tostring(root, encoding='UTF-8')
    pretty_xml = xml.dom.minidom.parseString(xml_bytes).toprettyxml(indent=" ")

    # Stage both files in a private scratch directory that is always removed
    # again, instead of a predictable, name-derived folder in the system
    # temp directory (which had to be deleted first if it already existed).
    with tempfile.TemporaryDirectory() as scratch_dir:
        csv_file = os.path.join(scratch_dir, "data.csv")
        xml_file = os.path.join(scratch_dir, "metadata.xml")

        x.to_csv(csv_file, index=False)
        with open(xml_file, "w", encoding="UTF-8") as f:
            f.write(pretty_xml)

        with zipfile.ZipFile(path, 'w') as zipf:
            zipf.write(xml_file, arcname='metadata.xml')
            zipf.write(csv_file, arcname='data.csv')

    print(f"File sucessfully written to {path}.")