Source code for xenonpy.descriptor.structure

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

import re

import numpy as np
from pymatgen.core import Element
from pymatgen.analysis.local_env import VoronoiNN

from xenonpy.descriptor.base import BaseDescriptor, BaseFeaturizer

# Names exported by ``from xenonpy.descriptor.structure import *``.
__all__ = ['RadialDistributionFunction', 'OrbitalFieldMatrix', 'Structures']


class RadialDistributionFunction(BaseFeaturizer):
    """
    Calculate pair distribution descriptor for machine learning.

    The radial axis ``[0, r_max]`` is split into ``n_bins - 1`` equally wide
    bins; one feature is produced per bin.
    """

    @property
    def feature_labels(self):
        """Feature names: the outer radius of every radial bin, as strings."""
        return [str(d) for d in self._interval[1:]]

    def __init__(self, n_bins=201, r_max=20.0, *, n_jobs=-1, on_errors='raise', return_type='any', target_col=None):
        """

        Parameters
        ----------
        n_bins: int
            Number of radial grid points. Must be greater than 1 because
            ``n_bins`` grid points delimit ``n_bins - 1`` bins.
        r_max: float
            Maximum of radial grid (the minimum is always set zero).
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Set -1 to use all cpu cores (default).
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: str
            Specific the return type.
            Can be ``any``, ``array`` and ``df``.
            ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
            If ``any``, the return type dependent on the input type.
            Default is ``any``
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame is used.
            Default is None.
        """

        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)

        # ``n_bins == 1`` used to pass this check and then fail with a
        # ZeroDivisionError when computing ``dr`` below.
        assert n_bins > 1, "n_bins should be greater than 1!"
        assert r_max > 0, "r_max should be greater than 0!"

        self.n_bins = n_bins
        self.r_max = r_max
        self.dr = r_max / (n_bins - 1)
        # Build the grid from integer indices instead of
        # ``np.arange(0.0, r_max + dr, dr)``: with a floating point step the
        # latter may emit one extra point due to rounding, which would silently
        # change the number of features. The grid values themselves (i * dr)
        # are the same.
        self._interval = self.dr * np.arange(n_bins)
        self.__authors__ = ['TsumiNa']

    def featurize(self, structure):
        """
        Get RDF of the input structure.

        Parameters
        ----------
        structure: pymatgen.Structure
            The input structure. Must be ordered.

        Returns
        -------
        rdf: np.ndarray
            Neighbor counts per radial bin, divided by the volume of the
            spherical shell of that bin and by the number density of the
            structure. The bin edges are not returned; their outer radii are
            available through ``feature_labels``.

        Raises
        ------
        ValueError
            If the structure is disordered.
        """
        if not structure.is_ordered:
            raise ValueError("Disordered structure support not built yet")

        # Get the distances between all atoms
        # (element 1 of every neighbor entry is the distance to the center site).
        neighbors_lst = structure.get_all_neighbors(self.r_max)
        all_distances = np.concatenate(tuple([nn[1] for nn in neighbors] for neighbors in neighbors_lst))

        # Compute a histogram
        dist_hist, dist_bins = np.histogram(all_distances, bins=self._interval, density=False)

        # Normalize counts
        # NOTE(review): the histogram accumulates the neighbors of every site but
        # is not divided by ``structure.num_sites`` -- confirm this is intended.
        shell_vol = 4.0 / 3.0 * np.pi * (np.power(dist_bins[1:], 3) - np.power(dist_bins[:-1], 3))
        number_density = structure.num_sites / structure.volume
        return dist_hist / shell_vol / number_density
class OrbitalFieldMatrix(BaseFeaturizer):
    """
    Representation based on the valence shell electrons of neighboring atoms.

    Each atom is described by a 32-element vector representing its valence
    subshells. A 32x32 matrix is formed by multiplying two atomic vectors.
    The OFM of an atomic environment is built from the vector of the center
    atom and the sum of the vectors of the atoms it coordinates with, each
    weighted by its Voronoi weight and, optionally, by ``1/r``. The descriptor
    of a structure is the mean of the flattened matrices of all its sites.
    """

    # Valence subshell occupations, in the order used by the one-hot element
    # vector and by ``feature_labels``.
    _SUBSHELLS = (
        's1', 's2',
        'p1', 'p2', 'p3', 'p4', 'p5', 'p6',
        'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10',
        'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14',
    )

    def __init__(self, including_d=True, *, n_jobs=-1, on_errors='raise', return_type='any', target_col=None):
        """

        Parameters
        ----------
        including_d: bool
            If true, add distance information: the contribution of every
            neighbor is divided by its distance to the center atom.
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Set -1 to use all cpu cores (default).
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: str
            Specific the return type.
            Can be ``any``, ``array`` and ``df``.
            ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
            If ``any``, the return type dependent on the input type.
            Default is ``any``
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame is used.
            Default is None.
        """

        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)

        self._including_d = including_d
        self.__authors__ = ['TsumiNa']
        # Raw string: the BibTeX entry contains ``\&``, which is an invalid
        # escape sequence in a normal string literal.
        self.__citations__ = [
            r'''
            @article{LamPham2017,
            archivePrefix = {arXiv},
            arxivId = {1705.01043},
            author = {{Lam Pham}, Tien and Kino, Hiori and Terakura, Kiyoyuki and Miyake, Takashi and Tsuda, Koji and Takigawa, Ichigaku and {Chi Dam}, Hieu},
            doi = {10.1080/14686996.2017.1378060},
            eprint = {1705.01043},
            issn = {18785514},
            journal = {Science and Technology of Advanced Materials},
            keywords = {Material descriptor,data mining,machine learning,magnetic materials,material informatics},
            number = {1},
            pages = {756--765},
            pmid = {29152012},
            publisher = {Taylor {\&} Francis},
            title = {{Machine learning reveals orbital interaction in materials}},
            url = {https://doi.org/10.1080/14686996.2017.1378060},
            volume = {18},
            year = {2017}
            }
            '''
        ]

    @staticmethod
    def get_element_representation(name):
        """
        generate one-hot representation for a element, e.g, si = [0.0, 1.0, 0.0, 0.0, ...]

        Parameters
        ----------
        name: string
            element symbol

        Returns
        -------
        np.ndarray
            Vector with one entry per subshell occupation (``s1`` ... ``f14``);
            an entry is 1.0 when that occupation appears in the electronic
            structure of the element and 0.0 otherwise.
        """
        # Constructed unconditionally so that an invalid symbol is rejected
        # by pymatgen for every input.
        element = Element(name)

        if name == 'H':
            occupied = {'s1'}
        elif name == 'He':
            occupied = {'s2'}
        else:
            # Each subshell after the noble-gas core looks like
            # ``.3d<sup>10</sup>`` or, without markup, ``.3d10``. The ``<sup>``
            # tags are optional so that both spellings are understood; a
            # mandatory tag would silently produce an all-zero vector for the
            # plain spelling.
            occupied = {
                ''.join(pair)
                for pair in re.findall(r"\.\d([A-Za-z]+)(?:<sup>)?(\d+)(?:</sup>)?", element.electronic_structure)
            }

        return np.array([1.0 if key in occupied else 0.0 for key in OrbitalFieldMatrix._SUBSHELLS])

    def featurize(self, structure, is_including_d=True):
        """
        Generate OFM descriptor

        Parameters
        ----------
        structure: pymatgen.Structure
            The input structure for OFM calculation.
        is_including_d: bool
            Ignored. Kept for backward compatibility only; the ``including_d``
            value given to the constructor decides whether distances are used.

        Returns
        -------
        np.ndarray
            The flattened local matrices averaged over all sites of the structure.
        """
        atoms = [site.species_string for site in structure]
        coordinator_finder = VoronoiNN(cutoff=10.0)
        n_subshells = len(self._SUBSHELLS)

        # The element vector only depends on the symbol, so compute it once per
        # distinct symbol instead of once per neighbor. The cached arrays are
        # never modified in place below.
        vectors = {}

        def element_vector(symbol):
            if symbol not in vectors:
                vectors[symbol] = self.get_element_representation(symbol)
            return vectors[symbol]

        local_orbital_field_matrices = []
        for i_atom, atom in enumerate(atoms):
            neighbors = coordinator_finder.get_nn_info(structure=structure, n=i_atom)
            site = structure[i_atom]
            center_vector = element_vector(atom)

            env_vector = np.zeros(n_subshells)
            for nn in neighbors:
                site_x = nn['site']
                w = nn['weight']
                neigh_vector = element_vector(site_x.species_string)
                if self._including_d:
                    d = np.sqrt(np.sum((site.coords - site_x.coords)**2))
                    env_vector += neigh_vector * w / d
                else:
                    env_vector += neigh_vector * w

            # Rows follow the environment vector, columns the center atom.
            local_matrix = center_vector[None, :] * env_vector[:, None]
            local_orbital_field_matrices.append(np.ravel(local_matrix))

        return np.array(local_orbital_field_matrices).mean(axis=0)

    @property
    def feature_labels(self):
        """Feature names ``<row>_<column>`` for every pair of subshell occupations, in row-major order."""
        labels = self._SUBSHELLS
        return [i + '_' + j for i in labels for j in labels]
class Structures(BaseDescriptor):
    """
    Calculate structure descriptors from compound's structure.

    Bundles a :class:`RadialDistributionFunction` and an
    :class:`OrbitalFieldMatrix` featurizer.
    """

    def __init__(self,
                 n_bins=201,
                 r_max=20.0,
                 including_d=True,
                 *,
                 n_jobs=-1,
                 featurizers='all',
                 on_errors='raise',
                 target_col=None):
        """

        Parameters
        ----------
        n_bins: int
            Number of radial grid points.
        r_max: float
            Maximum of radial grid (the minimum is always set zero).
        including_d: bool
            If true, add distance information.
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict.
            Set -1 to use all cpu cores (default).
        featurizers: list[str] or 'all'
            Featurizers that will be used.
            Default is 'all'.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of column corresponding to the number of feature labs.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame is used.
            Default is None.
        """

        super().__init__(featurizers=featurizers)
        self.n_jobs = n_jobs

        # Options handed unchanged to both featurizers.
        shared_options = dict(n_jobs=n_jobs, on_errors=on_errors, target_col=target_col)

        # NOTE(review): ``self.structure`` is assigned twice on purpose in the
        # original code -- presumably BaseDescriptor collects every featurizer
        # assigned to the same attribute instead of overwriting it; confirm
        # against xenonpy.descriptor.base. The assignment order is preserved.
        self.structure = RadialDistributionFunction(n_bins, r_max, **shared_options)
        self.structure = OrbitalFieldMatrix(including_d, **shared_options)