Source code for qmmd.qmcalc.tabulate

import re
from os import listdir
from os.path import isdir
from typing import Any, Dict, List, Optional, Union

import pandas as pd
from xlsxwriter.workbook import Workbook



[docs]
def sortNatural(targetList: List[str]) -> List[str]:
    """
    Sort a given list in a more natural way (e.g., 'abc2' before 'abc10').

    Each string is split into alternating text chunks and digit runs; digit
    runs are compared as integers and text chunks as ordinary strings.

    Parameters
    ----------
    targetList : List[str]
        List of strings to be sorted. The list is sorted in place.

    Returns
    -------
    List[str]
        The same list object, naturally sorted.
    """

    def naturalKey(text: str) -> list:
        # With a single capturing group, re.split always returns text chunks
        # at even indices and the captured digit runs at odd indices. Keys of
        # two different strings therefore never compare a str with an int at
        # the same position, which would raise TypeError.
        chunks = re.split(r"([0-9]+)", text)
        return [int(chunk) if idx % 2 else chunk for idx, chunk in enumerate(chunks)]

    targetList.sort(key=naturalKey)
    return targetList




[docs]
def replaceMultiple(str1: str, strsToReplace: List[str], str2: str) -> str:
    """
    Substitute every occurrence of several substrings with one replacement.

    The substrings are processed one after another, in list order, each pass
    operating on the result of the previous one.

    Parameters
    ----------
    str1 : str
        Text to operate on.
    strsToReplace : List[str]
        Substrings that should be replaced.
    str2 : str
        Text inserted in place of each matched substring.

    Returns
    -------
    str
        The text after all substitutions have been applied.
    """
    result = str1
    for needle in strsToReplace:
        # str.replace returns the text unchanged when the needle is absent.
        result = result.replace(needle, str2)
    return result




[docs]
def _readMethod(lineList: List[str], j: int) -> Optional[str]:
    """Return the third space-separated token of the line two entries before index j, if any."""
    # A negative index would silently wrap around to the other end of the list.
    if j < 2:
        return None
    tokens = lineList[j - 2].split(" ")
    return tokens[2] if len(tokens) > 2 else None


def _readValue(
    lineList: List[str], j: int, string: str, isEnergy: bool
) -> Optional[Union[float, str]]:
    """Return the value that follows `string` on line j, or None if nothing usable is there."""
    # Keep only the text after the keyword, up to the first backslash
    # (archive entries are backslash-separated), then take what follows '='.
    field = lineList[j].split(string)[-1].strip().split("\\")[0]
    raw = field.split("=")[-1].strip()
    if not isEnergy and not raw and j > 0:
        # The entry may continue on the neighbouring list element (multi-line
        # archive entry). j == 0 has no such neighbour; index -1 would wrap.
        raw = lineList[j - 1].split("\\")[0].split("=")[-1].strip()
    try:
        return float(raw.replace(" ", ""))
    except ValueError:
        # Energies must be numeric; other fields may be plain text, but an
        # empty string never counts as a found value.
        if isEnergy or not raw:
            return None
        return raw


def findVal(lineList: List[str], targetStr: List[str]) -> Union[float, str]:
    """
    Find the values of interest from Gaussian output files.

    Every target string is searched in turn, scanning `lineList` from its
    first element. For each target string the first line that yields a usable
    value wins; a value found for a later target string replaces the one found
    for an earlier target string.

    The first target string selects the parsing mode:

    * contains "%chk": the value is the third space-separated token of the
      list element two positions before the matching line;
    * contains "Energies" or "Enthalpies": the text after the keyword (and
      after '=') must parse as a float;
    * otherwise: the text after the keyword is returned as a float when it
      parses as one, else as a non-empty string. If nothing follows the
      keyword, the preceding list element is consulted.

    Parameters
    ----------
    lineList : List[str]
        List of lines from the Gaussian output file (reversed).
    targetStr : List[str]
        List of target strings to search for.

    Returns
    -------
    Union[float, str]
        Found value, either as a float (for energies) or a string.

    Raises
    ------
    ValueError
        If no target string yields a value (ValueError is a subclass of
        Exception).
    """
    isEnergy = "Energies" in targetStr[0] or "Enthalpies" in targetStr[0]
    isMethod = "%chk" in targetStr[0]
    val = None
    for string in targetStr:
        for j, line in enumerate(lineList):
            if string not in line:
                continue
            if isMethod:
                candidate = _readMethod(lineList, j)
            else:
                candidate = _readValue(lineList, j, string, isEnergy)
            if candidate is not None:
                val = candidate
                break
    if val is None:
        raise ValueError("Target string {0} not found!".format(targetStr))
    return val




[docs]
def writeToExcel(inputDirPath: str, verbose: bool = False) -> Workbook:
    """
    Tabulate the quantities of interest from Gaussian .out files to an Excel document.

    Every sub-directory ``<name>`` of `inputDirPath` is expected to hold a file
    ``<name>.out``; sub-directories without one are reported and skipped. The
    table is written to ``Energies.xlsx`` inside `inputDirPath`, with rows in
    natural order of the directory names.

    The text before the last "c" of a name (underscores and the tags "TR",
    "TSS", "TP" removed) is used as the molecule label, and "c" plus the text
    after it as the conformer label.

    Parameters
    ----------
    inputDirPath : str
        Path to the directory containing Gaussian output files.
    verbose : bool, optional
        Whether to display details of the process.

    Returns
    -------
    Workbook
        The xlsxwriter Workbook object (already closed, i.e. saved to disk).
    """
    groups = [f for f in listdir(inputDirPath) if isdir("{0}/{1}".format(inputDirPath, f))]

    if verbose:
        print(
            "\n# Tabulating values of interest from Gaussian .out files to an Excel sheet..."
        )
        print("\n# Input directory:\n", inputDirPath, "\n\n# Groups:\n", groups, "\n")

    # Column header -> keywords handed to findVal, in lookup order.
    searchFields = (
        ("Method", ["%chk"]),
        ("NImag", ["NI", "Im"]),
        ("Z (Hartree)", ["zero-point Energies"]),
        ("E (Hartree)", ["HF"]),
        ("H (Hartree)", ["thermal Enthalpies"]),
        ("G (Hartree)", ["thermal Free Energies"]),
    )
    columnOrder = [
        "Method",
        "Name",
        "Molecule",
        "Conformer",
        "NImag",
        "Z (Hartree)",
        "E (Hartree)",
        "H (Hartree)",
        "G (Hartree)",
    ]

    rows = []
    for name in groups:
        moleculeDir = "{0}/{1}".format(inputDirPath, name)
        if verbose:
            print("Molecule:", name)
        try:
            with open("{0}/{1}.out".format(moleculeDir, name), "r") as f:
                lineList = f.readlines()
        except FileNotFoundError:
            print("{0}/{1}.out not found!".format(moleculeDir, name))
            continue
        # findVal expects the lines in reverse file order.
        lineList.reverse()
        row = {header: findVal(lineList, keywords) for header, keywords in searchFields}
        row["Name"] = name
        # Split on the LAST "c" so the molecule part agrees with the conformer
        # part below even when the molecule name itself contains a "c".
        row["Molecule"] = replaceMultiple(
            name.rsplit("c", 1)[0].replace("_", ""), ["TR", "TSS", "TP"], ""
        )
        row["Conformer"] = "c" + name.split("c")[-1]
        rows.append(row)

    df = pd.DataFrame(rows, columns=columnOrder)
    nameList = sortNatural([row["Name"] for row in rows])
    # set_index/reset_index moves "Name" to the first column of the output.
    sortedDF = df.set_index("Name").reindex(nameList).reset_index()
    if verbose:
        print("# Sorted data frame:\n", sortedDF)
        print("# Writing to Excel sheet...")

    # The context manager saves and closes the file even if writing fails midway.
    with pd.ExcelWriter(
        "{0}/Energies.xlsx".format(inputDirPath), engine="xlsxwriter"
    ) as writer:
        sortedDF.to_excel(writer, startrow=1, sheet_name="Sheet1", index=False)
        workbook = writer.book
        worksheet = writer.sheets["Sheet1"]
        for i, col in enumerate(sortedDF.columns):
            cellWidths = sortedDF[col].astype(str).str.len()
            # max() of an empty column is NaN, which is not a valid column width.
            widest = 0 if cellWidths.empty else int(cellWidths.max())
            worksheet.set_column(i, i, max(widest, len(col)) + 2)
    return workbook



if __name__ == "__main__":
    import sys

    # Directory to tabulate: first command-line argument if given, otherwise
    # the example location below (to be modified for the local machine).
    inputDirPath = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/mnt/c/Users/ASUS/Documents/qmmd/src/qmmd/data/exampleXYZs"
    )
    workbook = writeToExcel(inputDirPath, verbose=True)