Source code for xenonpy.descriptor.structure

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

import re

import numpy as np
from pymatgen.core import Element
from pymatgen.analysis.local_env import VoronoiNN

from xenonpy.descriptor.base import BaseDescriptor, BaseFeaturizer

__all__ = ['RadialDistributionFunction', 'OrbitalFieldMatrix', 'Structures']


class RadialDistributionFunction(BaseFeaturizer):
    """
    Calculate the radial (pair) distribution function of a structure
    as a fixed-length descriptor for machine learning.

    The radial grid spans ``[0, r_max]`` with ``n_bins`` equally spaced points,
    which yields ``n_bins - 1`` histogram bins (features).
    """

    def __init__(self, n_bins=201, r_max=20.0, *, n_jobs=-1, on_errors='raise', return_type='any', target_col=None):
        """

        Parameters
        ----------
        n_bins: int
            Number of radial grid points. Must be greater than 1.
        r_max: float
            Maximum of radial grid (the minimum is always set to zero).
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default).
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: str
            Specify the return type.
            Can be ``any``, ``array`` and ``df``.
            ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
            If ``any``, the return type depends on the input type.
            Default is ``any``
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """

        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        # A single grid point would make the spacing ``r_max / (n_bins - 1)``
        # divide by zero, so at least two points are required.
        assert n_bins > 1, "n_bins should be greater than 1!"
        assert r_max > 0, "r_max should be greater than 0!"

        self.n_bins = n_bins
        self.r_max = r_max
        self.dr = r_max / (n_bins - 1)
        # ``linspace`` guarantees exactly ``n_bins`` edges ending at ``r_max``;
        # ``arange`` with a floating-point step may produce one edge more or less.
        self._interval = np.linspace(0.0, r_max, n_bins)
        self.__authors__ = ['TsumiNa']

    @property
    def feature_labels(self):
        """Labels of the features: the outer radius of each histogram bin, as strings."""
        return [str(d) for d in self._interval[1:]]

    def featurize(self, structure):
        """
        Get the RDF of the input structure.

        Parameters
        ----------
        structure: pymatgen.Structure
            The input structure. Must be ordered.

        Returns
        -------
        np.ndarray
            The normalized RDF, one value per radial bin (``n_bins - 1`` values).

        Raises
        ------
        ValueError
            If the structure is disordered.
        """
        if not structure.is_ordered:
            raise ValueError("Disordered structure support not built yet")

        # Collect the distance (second entry of each neighbor record) between
        # every site and all of its neighbors within ``r_max``.
        neighbors_lst = structure.get_all_neighbors(self.r_max)
        all_distances = [neighbor[1] for neighbors in neighbors_lst for neighbor in neighbors]

        # Count distances per radial bin.
        dist_hist, dist_bins = np.histogram(all_distances, bins=self._interval, density=False)

        # Normalize counts by the volume of each spherical shell and the number density.
        shell_vol = 4.0 / 3.0 * np.pi * (np.power(dist_bins[1:], 3) - np.power(dist_bins[:-1], 3))
        number_density = structure.num_sites / structure.volume
        return dist_hist / shell_vol / number_density


class OrbitalFieldMatrix(BaseFeaturizer):
    """
    Representation based on the valence shell electrons of neighboring atoms.

    Each atom is described by a 32-element vector representing its valence
    subshells. For every site, the vectors of its Voronoi neighbors are summed,
    each multiplied by the neighbor's Voronoi weight (and by 1/r when distance
    information is included). The outer product of this environment vector with
    the center atom's vector gives a 32x32 matrix, which is flattened; the
    descriptor of a structure is the mean of these matrices over all sites.

    NOTE(review): the original documentation also mentions a 39x39 variant;
    this implementation always produces 32x32 = 1024 features.
    """

    # Valence-subshell occupation labels, in the order used by the element vectors.
    _SUBSHELLS = (
        's1', 's2', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10',
        'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14'
    )

    # Matches one ".<n><subshell><occupation>" term of an electronic-structure
    # string, with or without HTML superscript markup around the occupation,
    # e.g. ".3d<sup>6</sup>" as well as ".3d6".
    _SUBSHELL_PATTERN = re.compile(r"\.\d([spdf])(?:<sup>)?(\d+)(?:</sup>)?")

    def __init__(self, including_d=True, *, n_jobs=-1, on_errors='raise', return_type='any', target_col=None):
        """

        Parameters
        ----------
        including_d: bool
            If true, add distance information (neighbor contributions are divided by distance).
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default).
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        return_type: str
            Specify the return type.
            Can be ``any``, ``array`` and ``df``.
            ``array`` and ``df`` force return type to ``np.ndarray`` and ``pd.DataFrame`` respectively.
            If ``any``, the return type depends on the input type.
            Default is ``any``
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
        self._including_d = including_d
        self.__authors__ = ['TsumiNa']
        # Raw string: the BibTeX entry contains ``\&``, which is not a valid
        # Python escape sequence in a normal string literal.
        self.__citations__ = [
            r'''
            @article{LamPham2017,
                archivePrefix = {arXiv},
                arxivId = {1705.01043},
                author = {{Lam Pham}, Tien and Kino, Hiori and Terakura, Kiyoyuki and Miyake, Takashi and Tsuda, Koji and Takigawa, Ichigaku and {Chi Dam}, Hieu},
                doi = {10.1080/14686996.2017.1378060},
                eprint = {1705.01043},
                issn = {18785514},
                journal = {Science and Technology of Advanced Materials},
                keywords = {Material descriptor,data mining,machine learning,magnetic materials,material informatics},
                number = {1},
                pages = {756--765},
                pmid = {29152012},
                publisher = {Taylor {\&} Francis},
                title = {{Machine learning reveals orbital interaction in materials}},
                url = {https://doi.org/10.1080/14686996.2017.1378060},
                volume = {18},
                year = {2017}
                }
            '''
        ]

    @staticmethod
    def get_element_representation(name):
        """
        Generate the valence-subshell representation of an element.

        The result has one entry per label in ``s1, s2, p1..p6, d1..d10, f1..f14``;
        an entry is 1.0 when the element's valence configuration contains that
        subshell with that occupation and 0.0 otherwise. Several entries can be
        set at once, e.g. Si (``3s2 3p2``) sets both ``s2`` and ``p2``.

        Parameters
        ----------
        name: string
            element symbol

        Returns
        -------
        np.ndarray
            Vector of 32 floats (0.0 or 1.0).
        """
        element = Element(name)

        # H and He are special-cased: their configuration is not parsed from
        # the electronic-structure string.
        if name == 'H':
            occupied = {'s1'}
        elif name == 'He':
            occupied = {'s2'}
        else:
            occupied = {
                subshell + occupation
                for subshell, occupation in OrbitalFieldMatrix._SUBSHELL_PATTERN.findall(element.electronic_structure)
            }

        return np.array([1.0 if label in occupied else 0.0 for label in OrbitalFieldMatrix._SUBSHELLS])

    def featurize(self, structure, is_including_d=True):
        """
        Generate OFM descriptor

        Parameters
        ----------
        structure: pymatgen.Structure
            The input structure for OFM calculation.
        is_including_d: bool
            Unused; kept for backward compatibility. Whether distance
            information is included is controlled by the ``including_d``
            argument given to the constructor.

        Returns
        -------
        np.ndarray
            Flattened orbital field matrix averaged over all sites
            (one value per entry of ``feature_labels``).
        """
        coordinator_finder = VoronoiNN(cutoff=10.0)
        n_subshells = len(self._SUBSHELLS)

        # Element vectors depend only on the species string, so compute each once per call.
        representations = {}

        def representation_of(species):
            if species not in representations:
                representations[species] = self.get_element_representation(species)
            return representations[species]

        local_orbital_field_matrices = []
        for i_atom, site in enumerate(structure):
            neighbors = coordinator_finder.get_nn_info(structure=structure, n=i_atom)

            center_vector = representation_of(site.species_string)
            env_vector = np.zeros(n_subshells)

            for nn in neighbors:
                site_x = nn['site']
                w = nn['weight']
                neigh_vector = representation_of(site_x.species_string)
                if self._including_d:
                    # Weight the neighbor contribution by the inverse distance to the center site.
                    d = np.sqrt(np.sum((site.coords - site_x.coords)**2))
                    env_vector += neigh_vector * w / d
                else:
                    env_vector += neigh_vector * w

            # Entry (i, j) is env_vector[i] * center_vector[j]; flattened row by row.
            local_matrix = center_vector[None, :] * env_vector[:, None]
            local_orbital_field_matrices.append(np.ravel(local_matrix))

        return np.array(local_orbital_field_matrices).mean(axis=0)

    @property
    def feature_labels(self):
        """Labels ``<subshell>_<subshell>`` for every pair of subshell labels, in row-major order."""
        labels = self._SUBSHELLS
        return [i + '_' + j for i in labels for j in labels]


class Structures(BaseDescriptor):
    """
    Calculate structure descriptors from compound's structure.

    Bundles :class:`RadialDistributionFunction` and :class:`OrbitalFieldMatrix`.
    """

    def __init__(self,
                 n_bins=201,
                 r_max=20.0,
                 including_d=True,
                 *,
                 n_jobs=-1,
                 featurizers='all',
                 on_errors='raise',
                 target_col=None):
        """

        Parameters
        ----------
        n_bins: int
            Number of radial grid points.
        r_max: float
            Maximum of radial grid (the minimum is always set to zero).
        including_d: bool
            If true, add distance information.
        n_jobs: int
            The number of jobs to run in parallel for both fit and predict. Set -1 to use all cpu cores (default).
        featurizers: list[str] or 'all'
            Featurizers that will be used.
            Default is 'all'.
        on_errors: string
            How to handle exceptions in feature calculations. Can be 'nan', 'keep', 'raise'.
            When 'nan', return a column with ``np.nan``.
            The length of the column corresponds to the number of feature labels.
            When 'keep', return a column with exception objects.
            The default is 'raise' which will raise up the exception.
        target_col
            Only relevant when input is pd.DataFrame, otherwise ignored.
            Specify a single column to be used for transformation.
            If ``None``, all columns of the pd.DataFrame are used.
            Default is None.
        """
        super().__init__(featurizers=featurizers)
        self.n_jobs = n_jobs

        # NOTE(review): both featurizers are assigned to the same attribute name.
        # Presumably ``BaseDescriptor`` registers each assigned featurizer under
        # the ``structure`` group instead of overwriting the previous one —
        # confirm against xenonpy.descriptor.base before changing this.
        self.structure = RadialDistributionFunction(n_bins,
                                                    r_max,
                                                    n_jobs=n_jobs,
                                                    on_errors=on_errors,
                                                    target_col=target_col)
        self.structure = OrbitalFieldMatrix(including_d, n_jobs=n_jobs, on_errors=on_errors, target_col=target_col)