Source code for ai2_kit.tool.model_devi

from typing import List, Tuple
from ase import Atoms

import pandas as pd

from ai2_kit.core.util import expand_globs
from ai2_kit.core.log import get_logger

logger = get_logger(__name__)

MdArray = List[Tuple[pd.DataFrame, str]]


[docs]class ModelDevi: """ A tool to analyze the deviation of model from model deviation file of deepmd-kit """
[docs] @staticmethod def md_arr_load(*files: str) -> MdArray: """ Load model deviation files """ md_arr = [] for file in expand_globs(files): with open(file, 'r') as f: f.seek(1) # skip the leading '#' df = pd.read_csv(f, delim_whitespace=True) md_arr.append((df, file)) return md_arr
def __init__(self, atoms_arr: List[Atoms], md_arr: MdArray): self._md_df = pd.concat([df for df, _ in md_arr]) self._atoms_arr = atoms_arr self._md_arr = md_arr if len(atoms_arr) != len(self._md_df): raise ValueError("The size of atoms and model deviation records should be the same") self._stats = {} self._grade = {}
[docs] def grade(self, lo: float, hi: float, col: str = 'max_devi_f'): """ Grade atoms based on the deviation of model: the good, the bad and the ugly the grade is based on the column of max_devi_f by default, if the value is below lo, the level is good, if the value is above hi, the level is ugly, otherwise, the level is bad :param lo: the lower bound of good level :param hi: the upper bound of ugly level :param col: the column of model deviation to grade, default is max_devi_f """ if col not in self._md_df.columns: raise ValueError(f"Unknown model deviation column: {col}") for df, file in self._md_arr: good = df[col] < lo bad = (df[col] >= lo) & (df[col] <= hi) ugly = df[col] > hi self._stats[file] = { 'g': good.sum(), 'b': bad.sum(), 'u': ugly.sum(), 'all': len(df), } self._grade['good'] = self._md_df[col] < lo self._grade['bad'] = (self._md_df[col] >= lo) & (self._md_df[col] <= hi) self._grade['ugly'] = self._md_df[col] > hi return self
[docs] def dump_stats(self, out_file: str = '', fmt='tsv'): """ Dump the statistics of grading """ from tabulate import tabulate headers = ['file', 'total', 'good', 'bad', 'ugly', 'good%', 'bad%', 'ugly%'] table = [] overall = { 'all': len(self._md_df), 'g': self._grade['good'].sum(), 'b': self._grade['bad'].sum(), 'u': self._grade['ugly'].sum(), } for file, stats in [*self._stats.items(), ('', overall)]: total = stats['all'] g = stats['g'] b = stats['b'] u = stats['u'] g_pct = '{:.2%}'.format(g / total) b_pct = '{:.2%}'.format(b / total) u_pct = '{:.2%}'.format(u / total) table.append([file, total, g, b, u, g_pct, b_pct, u_pct]) stats_report = tabulate(table, headers=headers, tablefmt=fmt) if out_file: with open(out_file, 'w') as f: f.write(stats_report) else: logger.info(f'model deviation statistics:\n{stats_report}') return self
[docs] def to_ase(self, level): """ Hand over the atoms to ase tool :param level: the grade level to hand over, valid values are good, bad, ugly """ if level not in self._grade: raise ValueError(f"Unknown grade level: {level}") atoms_arr = [self._atoms_arr[i] for i in self._grade[level].index if self._grade[level][i]] from .ase import AseTool return AseTool(atoms_arr)