Source code for VESIcal.batchfile

import pandas as pd
import os
import sys
import warnings as w

from VESIcal import core
from VESIcal import sample_class

# Turn off chained assignment pandas warning
pd.options.mode.chained_assignment = None  # default='warn'


def rename_duplicates(df, suffix='-duplicate-'):
    """Make the index labels of a DataFrame unique.

    The first occurrence of an index label is kept as is (converted to
    str). Every later occurrence of the same label gets `suffix` followed
    by its repeat number appended, e.g. 'A', 'A-duplicate-1',
    'A-duplicate-2'.

    Parameters
    ----------
    df: pandas DataFrame
        DataFrame whose index may contain repeated labels.

    suffix: str
        Text inserted between the original label and the repeat number.

    Returns
    -------
    pandas DataFrame
        A new DataFrame carrying the de-duplicated, string-valued index.
    """
    # Running count of how many times each label has already been seen
    # (0 for the first occurrence), expressed as text.
    seen_before = df.groupby(level=0).cumcount().astype(str)

    # First occurrences get no tag at all; repeats get suffix + count.
    tags = seen_before.map(
        lambda count: '' if count == '0' else suffix + count)

    unique_labels = df.index.astype(str) + tags
    return df.set_index(unique_labels)


class status_bar(object):
    """Various styles of status bars that display the progress of a
    calculation within a loop
    """

    def __init__(self):
        # Nothing to initialise; the class only groups display helpers.
        pass

    @staticmethod
    def status_bar(percent, sample_name=None, btext=None, barLen=20):
        """
        Prints an updating status bar to the terminal or jupyter notebook.

        Declared as a static method so that it can be called either on the
        class (``status_bar.status_bar(0.5)``) or on an instance.

        Parameters
        ----------
        percent: float
            Percent value of progress from 0 to 1

        sample_name: string
            Name of the current sample being calculated

        btext: string
            Any extra text to display next to status bar

        barLen: int
            Length of bar to print
        """
        # Carriage return moves the cursor back to the start of the line so
        # that each call overwrites the previously printed bar.
        sys.stdout.write("\r")
        sys.stdout.write("[{:<{}}] {:.0f}%".format("=" * int(barLen * percent),
                                                   barLen, percent * 100))

        sample_string = str(sample_name)

        # Set max number of characters in sample name
        max_name_length = 25
        if len(sample_string) >= max_name_length:
            sample_string = sample_string[0:max_name_length-1] + "..."

        # Write out the sample name followed by a trailing space
        if sample_name is not None:
            sys.stdout.write(" Working on sample " + sample_string + " ")

        if btext is not None:
            sys.stdout.write(" " + str(btext))

        # Finish the line once the calculation is complete
        if percent == 1.0:
            sys.stdout.write("\n")
        sys.stdout.flush()
# ---------- BATCHFILE CLASS --------- #
class BatchFile(object):
    """A batch file with sample names and oxide compositions

    Attributes
    ----------
    filename: str
        Path to the batch file, e.g., "my_file.xlsx". This always needs to
        be passed, even if the user is passing a pandas DataFrame rather
        than a batch file. If passing a DataFrame, filename should be set
        to None. File can be excel file (.xlsx) or .csv.

    sheet_name: str or int
        OPTIONAL. For Excel files. Default value is 0 which gets the first
        sheet in the batch spreadsheet file. This implements the
        pandas.read_excel() sheet_name parameter. But functionality to read
        in more than one sheet at a time (e.g.,
        pandas.read_excel(sheet_name=None)) is not yet implemented in
        VESIcal. From the pandas 1.0.4 documentation:
            Available cases:
                - Defaults to 0: 1st sheet as a DataFrame
                - 1: 2nd sheet as a DataFrame
                - "Sheet1": Load sheet with name "Sheet1"

    file_type: str
        OPTIONAL. Default is 'excel', which denotes that passed file has
        extension .xlsx. Other option is 'csv', which denotes that the
        passed file has extension .csv. When filename ends in .xlsx, .xls
        or .csv the file type is taken from the extension instead.

    units: str
        OPTIONAL. Default is 'wtpt_oxides'. String defining whether the
        oxide composition is given in wt percent ("wtpt_oxides", which is
        the default), mole oxides (mol_oxides) or mole cations
        (mol_cations).

    default_normalization: str
        The type of normalization to apply to the data by default. One of:
            - 'none' (default; no normalization)
            - 'standard': Normalizes an input composition to 100%.
            - 'fixedvolatiles': Normalizes major element oxides to 100
              wt%, including volatiles. The volatile wt% will remain
              fixed, whilst the other major element oxides are reduced
              proportionally so that the total is 100 wt%.
            - 'additionalvolatiles': Normalises major element oxide wt% to
              100%, assuming it is volatile-free. If H2O or CO2 are passed
              to the function, their un-normalized values will be retained
              in addition to the normalized non-volatile oxides, summing
              to >100%.

    default_units: str
        The type of composition to return by default, one of:
            - wtpt_oxides (default)
            - mol_oxides
            - mol_cations

    label: str
        OPTIONAL. Default is 'Label'. Name of the column within the passed
        file referring to sample names.

    dataframe: pandas DataFrame
        OPTIONAL. Default is None in which case this argument is ignored.
        This argument is used when the user wishes to turn a pandas
        DataFrame into a BatchFile object, for example when user data is
        already in python rather than being imported from a file. In this
        case set `dataframe` equal to the dataframe object being passed in.
        If using this option, pass None to filename.
    """

    def __init__(self, filename, sheet_name=0, file_type='excel',
                 units='wtpt_oxides', label='Label',
                 default_normalization='none', default_units='wtpt_oxides',
                 dataframe=None, **kwargs):
        """Return a BatchFile object whose parameters are defined here.

        Extra keyword arguments are accepted only so that a warning can be
        issued for 'model' and 'norm', both of which are ignored here.
        """
        self.units = units
        self.set_default_normalization(default_normalization)
        self.set_default_units(default_units)

        # A recognised file extension overrides the file_type argument
        if filename is not None:
            _, file_extension = os.path.splitext(filename)
            if file_extension in ('.xlsx', '.xls'):
                file_type = 'excel'
            if file_extension == '.csv':
                file_type = 'csv'

        if not isinstance(sheet_name, (str, int)):
            raise core.InputError("If sheet_name is passed, it must be of "
                                  "type str or int. Currently, VESIcal cannot "
                                  "import more than one sheet at a time.")

        # handle data if passed in as existing dataframe or as file
        if dataframe is not None:
            data = dataframe
            if label is not None:
                data = self.try_set_index(data, label)
        else:
            if file_type == 'excel':
                data = pd.read_excel(filename, sheet_name=sheet_name)
                data = self.try_set_index(data, label)
            elif file_type == 'csv':
                data = pd.read_csv(filename)
                data = self.try_set_index(data, label)
            else:
                raise core.InputError("file_type must be one of \'excel\' or "
                                      "\'csv\'.")

        # Sanitize data inputs
        data = rename_duplicates(data)  # handle any duplicated sample names

        for column in data:  # convert all oxide columns to numeric
            if column in core.oxides:
                data[column] = pd.to_numeric(data[column], errors='coerce')

        data = data.dropna(how='all')  # drop any rows that are all NaNs
        data = data.fillna(0)  # fill in any missing data with 0's

        if 'model' in kwargs:
            w.warn("You don't need to pass a model here, so it will be "
                   "ignored. You can specify a model when performing "
                   "calculations on your dataset (e.g., "
                   "calculate_dissolved_volatiles())",
                   RuntimeWarning, stacklevel=2)

        if 'norm' in kwargs:
            w.warn("We noticed you passed a norm argument here. This does "
                   "nothing. You can normalize your BatchFile and save it to "
                   "a new variable name after import using "
                   "normalize(BatchFileObject). See the documentation for "
                   "more info.",
                   RuntimeWarning, stacklevel=2)

        # Columns holding total iron are used as FeO: per sample where an
        # FeO column exists but holds 0, or wholesale where no FeO column
        # exists at all.
        total_iron_columns = ["FeOt", "FeOT", "FeOtot", "FeOtotal",
                              "FeOstar", "FeO*"]
        for name in total_iron_columns:
            if name in data.columns:
                if 'FeO' in data.columns:
                    for sample_index in data.index:
                        if (data.at[sample_index, "FeO"] == 0 and
                                data.at[sample_index, name] > 0):
                            w.warn("Sample " + str(sample_index) + ": " +
                                   str(name) + " value of " +
                                   str(data.at[sample_index, name]) +
                                   " used as FeO. Fe2O3 set to 0.0.",
                                   RuntimeWarning, stacklevel=2)
                            data.at[sample_index, "Fe2O3"] = 0.0
                            data.at[sample_index, "FeO"] = (
                                data.at[sample_index, name])
                else:
                    w.warn("Total iron column " + str(name) + " detected. " +
                           "This column will be treated as FeO. If Fe2O3 " +
                           "data are not given, Fe2O3 will be 0.0. In " +
                           "future, an option to calcualte FeO/Fe2O3 based " +
                           "on fO2 will be implemented.",
                           RuntimeWarning, stacklevel=2)
                    data['FeO'] = data[name]

        # Writing Fe2O3 for a single sample above creates the column when
        # it did not exist, leaving NaN in every other row. Fill those so
        # the "no missing data" guarantee made by fillna(0) still holds.
        if 'Fe2O3' in data.columns:
            data['Fe2O3'] = data['Fe2O3'].fillna(0)

        # Data are stored internally as wt% oxides; "wtpt_oxides" therefore
        # needs no conversion.
        if units == "mol_oxides":
            data = self._molOxides_to_wtpercentOxides(data)
        if units == "mol_cations":
            data = self._molCations_to_wtpercentOxides(data)

        # Clamp negative oxide values to 0. A single .loc assignment is
        # used because chained indexing (data[col][mask] = 0) does not
        # write through when pandas Copy-on-Write is active.
        for column in data:
            if column in core.oxides:
                data.loc[data[column] < 0, column] = 0

        self.data = data

    def set_default_normalization(self, default_normalization):
        """ Set the default type of normalization to use with the
        get_composition() method.

        Parameters
        ----------
        default_normalization: str
            The type of normalization to apply to the data. One of:
                - 'none' (no normalization)
                - 'standard': Normalizes an input composition to 100%.
                - 'fixedvolatiles': Normalizes major element oxides to 100
                  wt%, including volatiles. The volatile wt% will remain
                  fixed, whilst the other major element oxides are reduced
                  proportionally so that the total is 100 wt%.
                - 'additionalvolatiles': Normalises major element oxide wt%
                  to 100%, assuming it is volatile-free. If H2O or CO2 are
                  passed to the function, their un-normalized values will
                  be retained in addition to the normalized non-volatile
                  oxides, summing to >100%.

        Raises
        ------
        core.InputError
            If default_normalization is not one of the options above.
        """
        if default_normalization in ['none', 'standard', 'fixedvolatiles',
                                     'additionalvolatiles']:
            self.default_normalization = default_normalization
        else:
            raise core.InputError("The normalization method must be one of "
                                  "'none', 'standard', 'fixedvolatiles' "
                                  "or 'additionalvolatiles'.")

    def set_default_units(self, default_units):
        """ Set the default units of composition to return when using the
        get_composition() method.

        Parameters
        ----------
        default_units: str
            The type of composition to return, one of:
                - wtpt_oxides
                - mol_oxides
                - mol_cations

        Raises
        ------
        core.InputError
            If default_units is not one of the options above.
        """
        if default_units in ['wtpt_oxides', 'mol_oxides', 'mol_cations']:
            self.default_units = default_units
        else:
            raise core.InputError("The units must be one of 'wtpt_oxides', "
                                  "'mol_oxides','mol_cations'.")

    def get_composition(self, species=None, normalization=None, units=None,
                        exclude_volatiles=False, asBatchFile=False):
        """ Returns a pandas DataFrame containing the compositional
        information for all samples in the BatchFile object

        Parameters
        ----------
        species: NoneType or str
            The name of the oxide or cation to return the concentration of.
            If NoneType (default) the whole composition of each sample will
            be returned. If an oxide is passed, the value in wtpt will be
            returned unless units is set to 'mol_oxides', even if the
            default units for the sample object are mol_oxides. If an
            element is passed, the concentration will be returned as
            mol_cations, unless 'mol_singleO' is specified as units, even if
            the default units for the sample object are mol_singleO. Unless
            normalization is specified in the method call, none will be
            applied.

        normalization: NoneType or str
            The type of normalization to apply to the data. One of:
                - 'none' (no normalization)
                - 'standard': Normalizes an input composition to 100%.
                - 'fixedvolatiles': Normalizes major element oxides to 100
                  wt%, including volatiles. The volatile wt% will remain
                  fixed, whilst the other major element oxides are reduced
                  proportionally so that the total is 100 wt%.
                - 'additionalvolatiles': Normalises major element oxide wt%
                  to 100%, assuming it is volatile-free. If H2O or CO2 are
                  passed to the function, their un-normalized values will
                  be retained in addition to the normalized non-volatile
                  oxides, summing to >100%.
            If NoneType is passed (and species is None) the default
            normalization option will be used
            (self.default_normalization).

        units: NoneType or str
            The units of composition to return, one of:
                - wtpt_oxides
                - mol_oxides
                - mol_cations
                - mol_singleO
            If NoneType is passed (and species is None) the default units
            option will be used (self.default_units).

        exclude_volatiles: bool
            If True, volatiles will be excluded from the returned
            composition, prior to normalization and conversion.

        asBatchFile: bool
            If True, returns a BatchFile object. If False, returns a
            pandas.DataFrame object.

        Returns
        -------
        pandas.DataFrame or BatchFile object
            All sample information. None is returned (as the DataFrame) if
            the per-sample results are neither pandas Series nor floats.
        """
        # Fetch the default return types if not specified in function call
        if normalization is None and species is None:
            normalization = self.default_normalization
        if units is None and species is None:
            units = self.default_units

        new_compositions = []
        sample_names = []
        for index in self.data.index:
            sample_comp = self.get_sample_composition(index, units=units,
                                                      asSampleClass=True)
            new_compositions.append(sample_comp.get_composition(
                species=species, normalization=normalization, units=units,
                exclude_volatiles=exclude_volatiles))
            sample_names.append(index)

        if isinstance(new_compositions[0], pd.Series):
            # One Series per sample: stack them as rows, indexed by sample
            return_frame = pd.concat(
                [pd.DataFrame(j) for j in new_compositions], axis=1)
            return_frame = return_frame.transpose()
            return_frame["new_index"] = sample_names
            return_frame = return_frame.set_index("new_index")
            return_frame.index.name = None
        elif isinstance(new_compositions[0], float):
            # A single species was requested: one value per sample
            species_data = {species: new_compositions}
            return_frame = pd.DataFrame(species_data,
                                        index=list(sample_names))
        else:
            return_frame = None

        if asBatchFile is False:
            return return_frame
        else:
            return BatchFile(filename=None, dataframe=return_frame,
                             label=None)

    def get_data(self, normalization=None, units=None, asBatchFile=False):
        """ Returns all data stored in a BatchFile object (both
        compositional and other data). To return only the compositional
        data, use get_composition().

        Parameters
        ----------
        normalization: NoneType or str
            The type of normalization to apply to the data. One of:
                - 'none' (no normalization)
                - 'standard': Normalizes an input composition to 100%.
                - 'fixedvolatiles': Normalizes major element oxides to 100
                  wt%, including volatiles. The volatile wt% will remain
                  fixed, whilst the other major element oxides are reduced
                  proportionally so that the total is 100 wt%.
                - 'additionalvolatiles': Normalises major element oxide wt%
                  to 100%, assuming it is volatile-free. If H2O or CO2 are
                  passed to the function, their un-normalized values will
                  be retained in addition to the normalized non-volatile
                  oxides, summing to >100%.
            If NoneType is passed the default normalization option will be
            used (self.default_normalization).

        units: NoneType or str
            The units of composition to return, one of:
                - wtpt_oxides
                - mol_oxides
                - mol_cations
                - mol_singleO
            If NoneType is passed the default units option will be used
            (self.default_units).

        asBatchFile: bool
            If True, returns a BatchFile object. If False, returns a
            pandas.DataFrame object.

        Returns
        -------
        pandas.DataFrame or BatchFile object
            All sample information.
        """
        data = self.data.copy()

        # Fetch the default return units if not specified in function call
        if units is None:
            units = self.default_units

        # Fetch the default normalization if not specified in the function
        # call
        if normalization is None:
            normalization = self.default_normalization

        # Grab all compositional data
        compositional_data = self.get_composition(normalization=normalization,
                                                  units=units)

        # Grab all non-compositional data
        non_compositional_data = data.filter(
            [col for col in data.columns if col not in core.oxides])

        # concatenate both compositional and non-compositional dataframes
        # into one
        return_frame = pd.concat([compositional_data, non_compositional_data],
                                 axis=1)

        if asBatchFile is False:
            return return_frame
        else:
            return BatchFile(filename=None, dataframe=return_frame,
                             label=None)

    def get_sample_composition(self, samplename, species=None,
                               normalization=None, units=None,
                               asSampleClass=False):
        """ Returns oxide composition of a single sample from a
        user-imported file as a dictionary

        Parameters
        ----------
        samplename: string
            Name of the desired sample

        species: NoneType or str
            Name of a single species to return the concentration of. If
            NoneType (default) the whole composition is returned.

        normalization: NoneType or str
            The type of normalization to apply to the data. One of:
                - 'none' (no normalization)
                - 'standard': Normalizes an input composition to 100%.
                - 'fixedvolatiles': Normalizes major element oxides to 100
                  wt%, including volatiles. The volatile wt% will remain
                  fixed, whilst the other major element oxides are reduced
                  proportionally so that the total is 100 wt%.
                - 'additionalvolatiles': Normalises major element oxide wt%
                  to 100%, assuming it is volatile-free. If H2O or CO2 are
                  passed to the function, their un-normalized values will
                  be retained in addition to the normalized non-volatile
                  oxides, summing to >100%.
            If NoneType is passed (and species is None) the default
            normalization option will be used
            (self.default_normalization).

        units: NoneType or str
            The units of composition to return, one of:
                - wtpt_oxides
                - mol_oxides
                - mol_cations
                - mol_singleO
            If NoneType is passed (and species is None) the default units
            option will be used (self.default_units).

        asSampleClass: bool
            If True, the sample composition will be returned as a sample
            class, with default options.

        Returns
        -------
        dictionary, float, or sample_class.Sample object
            Composition of the sample as oxides

        Raises
        ------
        core.InputError
            If normalization is not one of the recognised options.
        """
        # Fetch the default return types if not specified in function call
        if normalization is None and species is None:
            normalization = self.default_normalization
        if units is None and species is None:
            units = self.default_units

        # Check that normalization being chosen is one of the possible
        # options
        if normalization not in [None, 'none', 'standard', 'fixedvolatiles',
                                 'additionalvolatiles']:
            raise core.InputError("The normalization method must be one of "
                                  "'none', 'standard', 'fixedvolatiles', "
                                  "or 'additionalvolatiles'.")

        # Keep only the oxide columns of the requested row
        sample_dict = self.data.loc[samplename].to_dict()
        sample_oxides = {item: value for item, value in sample_dict.items()
                         if item in core.oxides}

        _sample = sample_class.Sample(sample_oxides)

        # Get sample composition in terms of any species, units, and
        # normalization passed
        return_sample = _sample.get_composition(species=species, units=units,
                                                normalization=normalization)

        if asSampleClass:
            # NOTE(review): the converted/normalized values are wrapped in
            # a new Sample built with its default arguments - confirm that
            # Sample interprets them correctly when `units` differs from
            # the Sample default.
            return sample_class.Sample(return_sample)
        else:
            if species is None:
                return dict(return_sample)
            elif isinstance(species, str):
                return return_sample

    def _molOxides_to_wtpercentOxides(self, data):
        """Convert every row of `data` from mole oxides to the default
        composition returned by sample_class.Sample.get_composition().

        Oxides from core.oxides that have no column in `data` are treated
        as 0.0, and a column is written back for every oxide in
        core.oxides. `data` is modified in place and also returned.
        """
        for i, row in data.iterrows():
            sample_comp = {}
            for oxide in core.oxides:
                if oxide in data.columns:
                    sample_comp[oxide] = row[oxide]
                else:
                    sample_comp[oxide] = 0.0
            _sample = sample_class.Sample(sample_comp, units='mol_oxides')
            _sample_conv = _sample.get_composition()
            # Write back every oxide using the loop variable of THIS loop
            # (reusing the stale variable of the loop above would update
            # only the last oxide).
            for ox in core.oxides:
                data.loc[i, ox] = _sample_conv[ox]
        return data

    def _molCations_to_wtpercentOxides(self, data):
        """Convert every row of `data` from mole cations to the default
        composition returned by sample_class.Sample.get_composition().

        Cations that have no column in `data` are treated as 0.0, and a
        column is written for every oxide in core.oxides. `data` is
        modified in place and also returned.
        """
        # Look the cation up oxide by oxide; core.oxides is a collection
        # and cannot itself be used as a single lookup key.
        cations = [core.oxides_to_cations[ox] for ox in core.oxides]
        for i, row in data.iterrows():
            sample_comp = {}
            for cation in cations:
                if cation in data.columns:
                    sample_comp[cation] = row[cation]
                else:
                    sample_comp[cation] = 0.0
            _sample = sample_class.Sample(sample_comp, units='mol_cations')
            _sample_conv = _sample.get_composition()
            for oxide in core.oxides:
                data.loc[i, oxide] = _sample_conv[oxide]
        return data

    def try_set_index(self, dataframe, label):
        """
        Method to handle setting the index column in a BatchFile object.
        If no column is passed that matches the default index name, then
        this method will attempt to choose the 'best' column that the user
        might want to serve as an index column.

        Parameters
        ----------
        dataframe: pandas DataFrame

        label: str
            Name of the column within the passed Excel file referring to
            sample names.

        Returns
        -------
        pandas DataFrame
            A copy of `dataframe` with its index set; the passed DataFrame
            is not modified.
        """
        _dataframe = dataframe.copy()
        try:
            _dataframe = _dataframe.set_index(label)
        except Exception:
            # Deliberately broad: whatever the reason the requested label
            # cannot be used, fall back to the first non-oxide column.
            label_found = False
            for col in _dataframe.columns:
                if col in core.oxides:
                    continue
                _dataframe = _dataframe.set_index(col)
                label_found = True
                w.warn("No Label column given, so column '" + str(col) +
                       "' was chosen for you. To choose your own, set " +
                       "label='<column-name>'.",
                       RuntimeWarning, stacklevel=2)
                break

            if label_found is False:
                # Every column is an oxide: keep the existing index and
                # simply give it a name.
                _dataframe.index.name = 'Label'
                w.warn("No Label column given, so one was created for you. "
                       "To choose your own, set label='<column-name>'.",
                       RuntimeWarning, stacklevel=2)

        return _dataframe

    def save_excel(self, filename, calculations, sheet_names=None):
        """
        Saves data calculated by the user in batch processing mode (using
        the BatchFile class methods) to an organized Excel file, with the
        original user data plus any calculated data.

        Parameters
        ----------
        filename: string
            Name of the file. Extension (.xlsx) should be passed along with
            the name itself, all in quotes (e.g., 'myfile.xlsx').

        calculations: pandas DataFrame or list of pandas DataFrames
            A single DataFrame or list of DataFrames (e.g., calculated
            outputs from any of the core BatchFile functions:
            calculate_dissolved_volatiles,
            calculate_equilibrium_fluid_comp, and
            calculate_saturation_pressure). If None, only the original user
            data will be saved.

        sheet_names: None, string, or list
            OPTIONAL. Default value is None, in which case the sheets are
            named 'Calc0', 'Calc1', ... Allows user to set the name of the
            sheet or sheets written to the Excel file.

        Returns
        -------
        Creates and saves an Excel file with data from each calculation
        saved to its own sheet.

        Raises
        ------
        core.InputError
            If sheet_names does not match calculations in type or length,
            or contains non-string entries.
        """
        if isinstance(calculations, list):
            if not (isinstance(sheet_names, list) or sheet_names is None):
                raise core.InputError("If calculations is passed as list, "
                                      "sheet_names must also be list of same "
                                      "length")
        elif calculations is not None:
            calculations = [calculations]

        # Validate the sheet names before the workbook is opened so that an
        # invalid call does not leave a partially written file behind.
        if calculations is not None and sheet_names is not None:
            if not isinstance(sheet_names, list):
                sheet_names = [sheet_names]
            if len(sheet_names) != len(calculations):
                raise core.InputError("calculations and sheet_names "
                                      "must have the same length")
            if not all(isinstance(name, str) for name in sheet_names):
                raise core.InputError("if sheet_names is passed, "
                                      "it must be list of strings")

        with pd.ExcelWriter(filename) as writer:
            # sheet_name is passed by keyword: passing it positionally is
            # deprecated in recent pandas versions.
            self.data.to_excel(writer, sheet_name='Original_User_Data')
            if calculations is not None:
                if sheet_names is None:
                    for n, df in enumerate(calculations):
                        df.to_excel(writer, sheet_name='Calc%s' % n)
                else:
                    for df, name in zip(calculations, sheet_names):
                        df.to_excel(writer, sheet_name=name)

        print("Saved " + str(filename))

    def save_csv(self, filenames, calculations, **kwargs):
        """
        Saves data calculated by the user in batch processing mode to a
        comma-separated values (csv) file. Mirrors the pandas.to_csv()
        method. Any argument that can be passed to pandas.to_csv() can be
        passed here. One csv file will be saved for each calculation
        passed.

        Parameters
        ----------
        filenames: string or list of strings
            Name of the file. Extension (.csv) should be passed along with
            the name itself, all in quotes (e.g., 'myfile.csv'). The number
            of calculations passed must match the number of filenames
            passed. If passing more than one, should be passed as a list.

        calculations: pandas DataFrame or list of pandas DataFrames
            A single variable or list of variables containing calculated
            outputs from any of the core BatchFile functions:
            calculate_dissolved_volatiles,
            calculate_equilibrium_fluid_comp, and
            calculate_saturation_pressure.

        Returns
        -------
        Creates and saves a CSV file or files with data from each
        calculation saved to its own file.

        Raises
        ------
        core.InputError
            If the number of filenames and calculations differ.
        """
        if not isinstance(filenames, list):
            filenames = [filenames]
        if not isinstance(calculations, list):
            calculations = [calculations]
        if len(filenames) != len(calculations):
            raise core.InputError("calculations and filenames must have the "
                                  "same length")

        for csv_name, calculation in zip(filenames, calculations):
            calculation.to_csv(csv_name, **kwargs)
            print("Saved " + str(csv_name))
def from_DataFrame(dataframe, units='wtpt_oxides', label='Label'):
    """
    Transforms any pandas DataFrame object into a VESIcal BatchFile object.

    Parameters
    ----------
    dataframe: pd.DataFrame object
        DataFrame object containing samples and oxide compositions.

    units: str
        OPTIONAL. Default is 'wtpt_oxides'. String defining whether the
        oxide composition is given in wt percent ("wtpt_oxides", which is
        the default), mole fraction oxides ("mol_oxides"), or mole fraction
        cations ("mol_cations").

    label: str
        OPTIONAL. Default is 'Label'. Name of the column within the passed
        file referring to sample names. This column will be set as the
        index column.

    Returns
    -------
    VESIcal.BatchFile object
    """
    # No file is involved, so filename is None and the data come from the
    # DataFrame itself.
    batchfile_arguments = {
        'filename': None,
        'dataframe': dataframe,
        'units': units,
        'label': label,
    }
    return BatchFile(**batchfile_arguments)