Skip to content

FOCI module

FOCI

Class for computing FOCI.

Source code in xicorpy/foci.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
class FOCI:
    """Feature Ordering by Conditional Independence (FOCI) feature selector.

    The validated response is stored in ``y_`` and the candidate features in
    ``x_df``; `select_features` then greedily grows a selection from them.
    """

    def __init__(self, y: npt.ArrayLike, x: npt.ArrayLike):
        """
        Initialize and validate the FOCI object.

        You can then use the `select_features` method to select features.

        Args:
            y (npt.ArrayLike): A single list or 1D array or a pandas Series.
            x (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

        Raises:
            ValueError: If y is not 1d.
            ValueError: If x is not 1d or 2d.
            ValueError: If y and x have different lengths.
            ValueError: If there are <= 2 valid y values.
        """
        # All validation is delegated to the shared helper; it hands back the
        # prepared response and the feature frame used by every other method.
        self.y_, self.x_df = validate_and_prepare_for_conditional_dependence(y, x)

    def _get_next_p(self, current_selection: List[Union[int, str]]):
        """
        Pick the best not-yet-selected feature given the current selection.

        Args:
            current_selection: Features already chosen. When empty, every
                column of ``x_df`` is scored unconditionally.

        Returns:
            tuple: ``(feature, score)`` for the highest-scoring candidate, or
                ``(0, 0)`` when no scores were produced.
        """
        if current_selection:
            # Score only the remaining columns, conditioning on the chosen ones.
            candidates = [k for k in self.x_df if k not in current_selection]
            scores = compute_conditional_dependence_1d(
                self.y_, self.x_df[candidates], self.x_df[current_selection]
            )
        else:
            scores = compute_conditional_dependence_1d(self.y_, self.x_df)

        # NOTE(review): `scores` is treated as a mapping of feature -> score
        # (it is iterated for keys and indexed by key below) -- confirm against
        # compute_conditional_dependence_1d.
        if not scores:
            # A zero score makes the caller's `<= 0` check end the selection.
            return 0, 0  # pragma: no cover
        best = max(scores, key=lambda k: scores[k])
        return best, scores[best]

    def select_features(
        self,
        num_features: int = None,
        init_selection: List[Union[int, str]] = None,
        get_conditional_dependency: bool = False,
    ) -> Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]:
        """
        Selects features using the Feature Ordering by Conditional Independence (FOCI) algorithm in:
            [Azadkia and Chatterjee (2021). "A simple measure of conditional dependence", Annals of Statistics](https://arxiv.org/abs/1910.12327)

        Features are added one at a time. Selection stops when `num_features`
        is reached or when the best remaining candidate scores `<= 0`.

        Args:
            num_features: Maximum number of features to select. Defaults to the number of features in x.
            init_selection (list): Initial selection of features. Any iterable
                (list, tuple, `np.ndarray`, `pd.Index`, ...) is accepted; it is
                copied and never mutated.
            get_conditional_dependency (bool): If True, returns conditional dependency. Defaults to False

        Returns:
            list: List of selected features.
                If x was `pd.DataFrame`, this will be column names.
                Otherwise, this will be indices.
            list: Conditional Dependency measure as each feature got selected
                Only when get_conditional_dependency is True.
                Holds one entry per feature added by this call (none for the
                features supplied through `init_selection`).

        """
        if num_features is None:  # pragma: no cover
            num_features = self.x_df.shape[1]

        # Test against None instead of truthiness: array-likes such as
        # np.ndarray or pd.Index raise on bool(), so `init_selection or []`
        # would fail for them. list(...) also gives us a private copy.
        current_selection = [] if init_selection is None else list(init_selection)
        if len(current_selection) >= num_features:
            warnings.warn("Initial selection is already complete")
            if get_conditional_dependency:  # pragma: no cover
                return current_selection, []
            return current_selection

        codec = []
        while len(current_selection) < num_features:
            next_p, next_t = self._get_next_p(current_selection)

            # A non-positive score means no remaining feature helps; stop.
            if next_t <= 0:  # pragma: no cover
                break

            current_selection.append(next_p)
            if get_conditional_dependency:
                # Dependence of y on the whole selection made so far.
                codec.append(
                    compute_conditional_dependence(
                        self.y_, self.x_df[current_selection]
                    )
                )

        if get_conditional_dependency:
            return current_selection, codec

        return current_selection

__init__(y, x)

Initialize and validate the FOCI object.

You can then use the select_features method to select features.

Parameters:

Name Type Description Default
y npt.ArrayLike

A single list or 1D array or a pandas Series.

required
x npt.ArrayLike

A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

required

Raises:

Type Description
ValueError

If y is not 1d.

ValueError

If x is not 1d or 2d.

ValueError

If y and x have different lengths.

ValueError

If there are <= 2 valid y values.

Source code in xicorpy/foci.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def __init__(self, y: npt.ArrayLike, x: npt.ArrayLike):
    """
    Build a FOCI object from a response `y` and candidate features `x`.

    After construction, call `select_features` to run the selection.

    Args:
        y (npt.ArrayLike): A single list or 1D array or a pandas Series.
        x (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

    Raises:
        ValueError: If y is not 1d.
        ValueError: If x is not 1d or 2d.
        ValueError: If y and x have different lengths.
        ValueError: If there are <= 2 valid y values.
    """
    # The helper performs all validation and returns the (response, features)
    # pair that the rest of the object works with.
    prepared = validate_and_prepare_for_conditional_dependence(y, x)
    self.y_, self.x_df = prepared

select_features(num_features=None, init_selection=None, get_conditional_dependency=False)

Selects features using the Feature Ordering by Conditional Independence (FOCI) algorithm described in: Azadkia and Chatterjee (2021). "A simple measure of conditional dependence", Annals of Statistics

Parameters:

Name Type Description Default
num_features int

Maximum number of features to select. Defaults to the number of features in x.

None
init_selection list

Initial selection of features.

None
get_conditional_dependency bool

If True, returns conditional dependency. Defaults to False

False

Returns:

Name Type Description
list Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]

List of selected features. If x was pd.DataFrame, this will be column names. Otherwise, this will be indices.

list Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]

Conditional dependency measure recorded as each feature was selected. Returned only when get_conditional_dependency is True.

Source code in xicorpy/foci.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def select_features(
    self,
    num_features: int = None,
    init_selection: List[Union[int, str]] = None,
    get_conditional_dependency: bool = False,
) -> Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]:
    """
    Selects features using the Feature Ordering by Conditional Independence (FOCI) algorithm in:
        [Azadkia and Chatterjee (2021). "A simple measure of conditional dependence", Annals of Statistics](https://arxiv.org/abs/1910.12327)

    Features are added one at a time. Selection stops when `num_features`
    is reached or when the best remaining candidate scores `<= 0`.

    Args:
        num_features: Maximum number of features to select. Defaults to the number of features in x.
        init_selection (list): Initial selection of features. Any iterable
            (list, tuple, `np.ndarray`, `pd.Index`, ...) is accepted; it is
            copied and never mutated.
        get_conditional_dependency (bool): If True, returns conditional dependency. Defaults to False

    Returns:
        list: List of selected features.
            If x was `pd.DataFrame`, this will be column names.
            Otherwise, this will be indices.
        list: Conditional Dependency measure as each feature got selected
            Only when get_conditional_dependency is True.
            Holds one entry per feature added by this call (none for the
            features supplied through `init_selection`).

    """
    if num_features is None:  # pragma: no cover
        num_features = self.x_df.shape[1]

    # Test against None instead of truthiness: array-likes such as
    # np.ndarray or pd.Index raise on bool(), so `init_selection or []`
    # would fail for them. list(...) also gives us a private copy.
    current_selection = [] if init_selection is None else list(init_selection)
    if len(current_selection) >= num_features:
        warnings.warn("Initial selection is already complete")
        if get_conditional_dependency:  # pragma: no cover
            return current_selection, []
        return current_selection

    codec = []
    while len(current_selection) < num_features:
        next_p, next_t = self._get_next_p(current_selection)

        # A non-positive score means no remaining feature helps; stop.
        if next_t <= 0:  # pragma: no cover
            break

        current_selection.append(next_p)
        if get_conditional_dependency:
            # Dependence of y on the whole selection made so far.
            codec.append(
                compute_conditional_dependence(
                    self.y_, self.x_df[current_selection]
                )
            )

    if get_conditional_dependency:
        return current_selection, codec

    return current_selection

select_features_using_foci(y, x, num_features=None, init_selection=None, get_conditional_dependency=False)

Implements the FOCI algorithm for feature selection.

Azadkia and Chatterjee (2021). "A simple measure of conditional dependence", Annals of Statistics. https://arxiv.org/abs/1910.12327.

Parameters:

Name Type Description Default
y npt.ArrayLike

The dependent variable. A single list or 1D array or a pandas Series.

required
x npt.ArrayLike

The independent variables. A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

required
num_features int

Max number of features to select. Defaults to ALL features.

None
init_selection list

Initial selection of features. If x is a pd.DataFrame, this is expected to be a list of column names. Otherwise, this is expected to be a list of indices.

None
get_conditional_dependency bool

If True, returns conditional dependency

False

Returns:

Name Type Description
list Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]

List of selected features. If x was pd.DataFrame, this will be column names. Otherwise, this will be indices.

list Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]

Conditional dependency measure recorded as each feature was selected. Returned only when get_conditional_dependency is True.

Raises:

Type Description
ValueError

If y is not 1d.

ValueError

If x is not 1d or 2d.

ValueError

If y and x have different lengths.

ValueError

If there are <= 2 valid y values.

Source code in xicorpy/foci.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
def select_features_using_foci(
    y: npt.ArrayLike,
    x: npt.ArrayLike,
    num_features: int = None,
    init_selection: List[Union[int, str]] = None,
    get_conditional_dependency: bool = False,
) -> Union[List[Union[int, str]], Tuple[List[Union[int, str]], List[float]]]:
    """
    Run FOCI feature selection in a single call.

    Convenience wrapper: builds a `FOCI` object from `y` and `x` and forwards
    the remaining arguments to its `select_features` method.

    Azadkia and Chatterjee (2021). "A simple measure of conditional dependence", Annals of Statistics.
    https://arxiv.org/abs/1910.12327.

    Args:
        y (npt.ArrayLike): The dependent variable. A single list or 1D array or a pandas Series.
        x (npt.ArrayLike): The independent variables. A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.
        num_features: Max number of features to select. Defaults to ALL features.
        init_selection (list): Initial selection of features.
            If `x` is a `pd.DataFrame`, this is expected to be a list of column names.
            Otherwise, this is expected to be a list of indices.
        get_conditional_dependency (bool): If True, returns conditional dependency

    Returns:
        list: List of selected features.
            If x was `pd.DataFrame`, this will be column names.
            Otherwise, this will be indices.
        list: Conditional Dependency measure as each feature got selected
            Only when get_conditional_dependency is True

    Raises:
        ValueError: If y is not 1d.
        ValueError: If x is not 1d or 2d.
        ValueError: If y and x have different lengths.
        ValueError: If there are <= 2 valid y values.

    """
    # Constructing the selector validates y and x before any selection runs.
    selector = FOCI(y, x)
    return selector.select_features(
        num_features=num_features,
        init_selection=init_selection,
        get_conditional_dependency=get_conditional_dependency,
    )