Skip to content

Workflow module

XiCorrelation

Class containing Xi Correlation computation components

Source code in xicorpy/correlation.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
class XiCorrelation:
    """Class containing Xi Correlation computation components"""

    def __init__(self, x: npt.ArrayLike, y: npt.ArrayLike = None):
        """
        Validate the inputs and store them as numeric DataFrames.

        If only `x` is passed, computes correlation between each column of `x`.
        If `y` is also passed, computes correlation between each column of `x` vs each column of `y`.

        If only `x` is passed, `x` MUST be 2-d (at least 2 rows and 2 columns).
        Otherwise, both `x` and `y` can be 1-d.

        Args:
            x (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.
            y (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

        Raises:
            ValueError: If `x` (or `y`, when given) is not a non-empty 1D/2D input.
            ValueError: If `x` and `y` do not have the same number of samples (first dimension).
            ValueError: If `y` is omitted and `x` is not 2D with at least 2 rows and 2 columns.

        """
        if not (1 <= np.ndim(x) <= 2 and np.shape(x)[0] >= 1):
            raise ValueError("x must be a 1D/2D array/list")

        x_df = pd.DataFrame(x)
        x_shape = np.shape(x)

        if y is not None:
            if not (1 <= np.ndim(y) <= 2 and np.shape(y)[0] >= 1):
                raise ValueError("y must be a 1D/2D array/list")
            y_shape = np.shape(y)
            if x_shape[0] != y_shape[0]:
                raise ValueError(
                    f"x: {x_shape[0]} samples, y: {y_shape[0]} samples. "
                    f"x and y MUST HAVE the same number of samples"
                )
            y_df = pd.DataFrame(y)
        else:
            # Without y, the columns of x are correlated against each other,
            # so at least two columns (and two rows) are required.
            if not (np.ndim(x) == 2 and np.shape(x)[0] >= 2 and np.shape(x)[1] >= 2):
                raise ValueError("x must be 2D if y is not provided")
            y_df = pd.DataFrame(x)

        self.x_df = convert_to_numeric(x_df)
        self.y_df = convert_to_numeric(y_df)

        # The raw inputs are kept because compute_xi picks its return
        # container (float / ndarray / DataFrame) from their original types.
        self._x = x
        self._y = y

    def compute_xi(
        self,
        get_modified_xi: bool = None,
        m_nearest_neighbours: int = None,
        get_p_values: bool = False,
    ) -> Union[_RetType, Tuple[_RetType, _RetType]]:
        """
        Compute the Xi Coefficient (Chatterjee's Rank Correlation) between columns in X and Y.

        Xi Coefficient based on:
            [Chatterjee (2020). "A new coefficient of correlation"](https://arxiv.org/abs/1909.10140)


        Modified Xi Coefficient based on:
            [Lin and Han (2021). "On boosting the power of Chatterjee's rank correlation"](https://arxiv.org/abs/2108.06828)

        The modified Xi Coefficient looks at M nearest neighbours to compute the correlation.
        This allows the coefficient to converge much faster. However, it is computationally slightly more intensive.
        For very large data, the two are likely to be very similar. We recommend using the modified Xi Coefficient.

        Args:
            get_modified_xi: Should the modified xi be computed?
                    Defaults to True when there are no ties, and False when ties are present.
                    If explicitly True while ties are present, a RuntimeWarning is
                    emitted and the flag is passed on unchanged.
            m_nearest_neighbours: Only used when get_modified_xi is True.
                    Defaults to square-root of array size.
            get_p_values: Should the p-values be computed?
                    The null hypothesis is that Y is completely independent of X (i.e., xi = 0).

        Returns:
            float/np.ndarray/pd.DataFrame:
            - Xi Coefficient Values.
                - If both X and Y were passed as 1-d Python lists, returns a single float.
                - If X is a numpy array, returns a 2-D numpy array.
                - Otherwise returns a pd.DataFrame (rows: X columns, columns: Y columns).
            - P-Values (only when get_p_values is true):
                - Same format as Xi

        """
        # An explicit False needs no tie inspection; otherwise resolve the
        # automatic default (None) or warn about an explicit True with ties.
        if get_modified_xi is not False:
            ties = _check_ties(self.x_df, self.y_df)
            if ties:
                if get_modified_xi is True:
                    warnings.warn(
                        "Cannot use modified xi when there are ties present. Either explicitly set "
                        "`get_modified_xi=False` or leave as `None` to accept automatic decision.",
                        RuntimeWarning,
                    )
                else:
                    get_modified_xi = False
            elif get_modified_xi is None:
                get_modified_xi = True

        ret = pd.DataFrame(0.0, index=self.x_df.columns, columns=self.y_df.columns)
        # Every cell starts at the p-value reported for xi = 0 over the full
        # sample count; cells that get a real computation are overwritten below.
        _, p = _get_p_no_ties(0, self.x_df.shape[0])
        p_values = pd.DataFrame(p, index=self.x_df.columns, columns=self.y_df.columns)

        for i in self.x_df.columns:
            i_col: pd.Series = self.x_df[i]
            if i_col.min() == i_col.max():  # pragma: no cover
                # Constant column. Correlation will anyway be 0.
                ret.loc[i] = 0
                continue

            if i_col.hasnans:
                i_col = i_col.dropna()

            if i_col.shape[0] <= 2:  # pragma: no cover
                # Not enough samples to compute correlation.
                ret.loc[i] = 0
                continue

            # Sort once to avoid sorting each time we compute correlation.
            i_col = i_col.sort_values(kind="stable")
            for j in self.y_df.columns:
                # Align y with the (NaN-free, sorted) x ordering.
                j_col: pd.Series = self.y_df.loc[i_col.index, j]
                if j_col.hasnans:
                    j_col = j_col.dropna()

                if get_p_values:
                    # The middle value of the triple is not needed here.
                    xi, _, p = _single_pair(  # type: ignore
                        i_col,
                        j_col,
                        get_modified_xi,
                        m_nearest_neighbours,
                        True,
                    )
                    ret.loc[i, j] = xi
                    p_values.loc[i, j] = p
                else:
                    ret.loc[i, j] = _single_pair(
                        i_col,
                        j_col,
                        get_modified_xi,
                        m_nearest_neighbours,
                        False,
                    )

        # Two plain 1-d lists: unwrap the single cell into scalars.
        if (
            isinstance(self._x, list)
            and isinstance(self._y, list)
            and np.ndim(self._x) == 1
            and np.ndim(self._y) == 1
        ):
            if get_p_values:
                return ret.values[0, 0], p_values.values[0, 0]
            return ret.values[0, 0]

        # numpy in, numpy out.
        if isinstance(self._x, np.ndarray):
            ret = ret.values
            p_values = p_values.values

        if get_p_values:
            return ret, p_values

        return ret

__init__(x, y=None)

If only x is passed, computes correlation between each column of x. If y is also passed, computes correlation between each column of x vs each column of y.

If only x is passed, x MUST be 2-d. Otherwise, both x and y can be 1-d

Parameters:

Name Type Description Default
x npt.ArrayLike

A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

required
y npt.ArrayLike

A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

None

Raises:

Type Description
ValueError

If x and y do not have the same number of samples.

ValueError

If there are fewer than 2 columns to compute correlation.

Source code in xicorpy/correlation.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(self, x: npt.ArrayLike, y: npt.ArrayLike = None):
    """
    Validate `x` (and `y`, when given) and store them as numeric DataFrames.

    With only `x`, correlations are computed between the columns of `x`, so `x`
    MUST be 2-d. With both `x` and `y`, every column of `x` is paired with every
    column of `y`, and either input may be 1-d.

    Args:
        x (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.
        y (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

    Raises:
        ValueError: If `x` (or `y`, when given) is not a non-empty 1D/2D input.
        ValueError: If `x` and `y` differ in their number of samples.
        ValueError: If `y` is omitted and `x` is not 2D with at least 2 rows and 2 columns.

    """
    x_dims = np.ndim(x)
    if x_dims not in (1, 2) or np.shape(x)[0] < 1:
        raise ValueError("x must be a 1D/2D array/list")

    x_df = pd.DataFrame(x)
    x_shape = np.shape(x)

    if y is None:
        # Only x was supplied: its columns are compared with each other.
        if x_dims != 2 or x_shape[0] < 2 or x_shape[1] < 2:
            raise ValueError("x must be 2D if y is not provided")
        y_df = pd.DataFrame(x)
    else:
        if np.ndim(y) not in (1, 2) or np.shape(y)[0] < 1:
            raise ValueError("y must be a 1D/2D array/list")
        n_x, n_y = x_shape[0], np.shape(y)[0]
        if n_x != n_y:
            raise ValueError(
                f"x: {n_x} samples, y: {n_y} samples. "
                f"x and y MUST HAVE the same number of samples"
            )
        y_df = pd.DataFrame(y)

    self.x_df = convert_to_numeric(x_df)
    self.y_df = convert_to_numeric(y_df)

    # Keep the untouched inputs alongside the converted frames.
    self._x = x
    self._y = y

compute_xi(get_modified_xi=None, m_nearest_neighbours=None, get_p_values=False)

Compute the Xi Coefficient (Chatterjee's Rank Correlation) between columns in X and Y.

Xi Coefficient based on

Chatterjee (2020). "A new coefficient of correlation"

Modified Xi Coefficient based on

Lin and Han (2021). "On boosting the power of Chatterjee's rank correlation"

The modified Xi Coefficient looks at M nearest neighbours to compute the correlation. This allows the coefficient to converge much faster. However, it is computationally slightly more intensive. For very large data, the two are likely to be very similar. We recommend using the modified Xi Coefficient.

Parameters:

Name Type Description Default
get_modified_xi bool

Should the modified xi be computed? Defaults to True when there are no ties, and False when ties are present.

None
m_nearest_neighbours int

Only used when get_modified_xi is True. Defaults to square-root of array size.

None
get_p_values bool

Should the p-values be computed? The null hypothesis is that Y is completely independent of X (i.e., xi = 0).

False

Returns:

Type Description
Union[_RetType, Tuple[_RetType, _RetType]]

float/np.ndarray/pd.DataFrame:

Union[_RetType, Tuple[_RetType, _RetType]]
  • Xi Coefficient Values.
  • If both X and Y are 1-d, returns a single float.
  • If X is numpy object, returns a 2-D numpy array.
  • Otherwise returns a pd.DataFrame.
Union[_RetType, Tuple[_RetType, _RetType]]
  • P-Values (only when get_p_values is true):
  • Same format as Xi
Source code in xicorpy/correlation.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def compute_xi(
    self,
    get_modified_xi: bool = None,
    m_nearest_neighbours: int = None,
    get_p_values: bool = False,
) -> Union[_RetType, Tuple[_RetType, _RetType]]:
    """
    Compute the Xi Coefficient (Chatterjee's Rank Correlation) between columns in X and Y.

    Xi Coefficient based on:
        [Chatterjee (2020). "A new coefficient of correlation"](https://arxiv.org/abs/1909.10140)


    Modified Xi Coefficient based on:
        [Lin and Han (2021). "On boosting the power of Chatterjee's rank correlation"](https://arxiv.org/abs/2108.06828)

    The modified Xi Coefficient looks at M nearest neighbours to compute the correlation.
    This allows the coefficient to converge much faster. However, it is computationally slightly more intensive.
    For very large data, the two are likely to be very similar. We recommend using the modified Xi Coefficient.

    Args:
        get_modified_xi: Should the modified xi be computed?
                Defaults to True when there are no ties, and False when ties are present.
                If explicitly True while ties are present, a RuntimeWarning is
                emitted and the flag is passed on unchanged.
        m_nearest_neighbours: Only used when get_modified_xi is True.
                Defaults to square-root of array size.
        get_p_values: Should the p-values be computed?
                The null hypothesis is that Y is completely independent of X (i.e., xi = 0).

    Returns:
        float/np.ndarray/pd.DataFrame:
        - Xi Coefficient Values.
            - If both X and Y were passed as 1-d Python lists, returns a single float.
            - If X is a numpy array, returns a 2-D numpy array.
            - Otherwise returns a pd.DataFrame (rows: X columns, columns: Y columns).
        - P-Values (only when get_p_values is true):
            - Same format as Xi

    """
    # An explicit False needs no tie inspection; otherwise resolve the
    # automatic default (None) or warn about an explicit True with ties.
    if get_modified_xi is not False:
        ties = _check_ties(self.x_df, self.y_df)
        if ties:
            if get_modified_xi is True:
                warnings.warn(
                    "Cannot use modified xi when there are ties present. Either explicitly set "
                    "`get_modified_xi=False` or leave as `None` to accept automatic decision.",
                    RuntimeWarning,
                )
            else:
                get_modified_xi = False
        elif get_modified_xi is None:
            get_modified_xi = True

    ret = pd.DataFrame(0.0, index=self.x_df.columns, columns=self.y_df.columns)
    # Every cell starts at the p-value reported for xi = 0 over the full
    # sample count; cells that get a real computation are overwritten below.
    _, p = _get_p_no_ties(0, self.x_df.shape[0])
    p_values = pd.DataFrame(p, index=self.x_df.columns, columns=self.y_df.columns)

    for i in self.x_df.columns:
        i_col: pd.Series = self.x_df[i]
        if i_col.min() == i_col.max():  # pragma: no cover
            # Constant column. Correlation will anyway be 0.
            ret.loc[i] = 0
            continue

        if i_col.hasnans:
            i_col = i_col.dropna()

        if i_col.shape[0] <= 2:  # pragma: no cover
            # Not enough samples to compute correlation.
            ret.loc[i] = 0
            continue

        # Sort once to avoid sorting each time we compute correlation.
        i_col = i_col.sort_values(kind="stable")
        for j in self.y_df.columns:
            # Align y with the (NaN-free, sorted) x ordering.
            j_col: pd.Series = self.y_df.loc[i_col.index, j]
            if j_col.hasnans:
                j_col = j_col.dropna()

            if get_p_values:
                # The middle value of the triple is not needed here.
                xi, _, p = _single_pair(  # type: ignore
                    i_col,
                    j_col,
                    get_modified_xi,
                    m_nearest_neighbours,
                    True,
                )
                ret.loc[i, j] = xi
                p_values.loc[i, j] = p
            else:
                ret.loc[i, j] = _single_pair(
                    i_col,
                    j_col,
                    get_modified_xi,
                    m_nearest_neighbours,
                    False,
                )

    # Two plain 1-d lists: unwrap the single cell into scalars.
    if (
        isinstance(self._x, list)
        and isinstance(self._y, list)
        and np.ndim(self._x) == 1
        and np.ndim(self._y) == 1
    ):
        if get_p_values:
            return ret.values[0, 0], p_values.values[0, 0]
        return ret.values[0, 0]

    # numpy in, numpy out.
    if isinstance(self._x, np.ndarray):
        ret = ret.values
        p_values = p_values.values

    if get_p_values:
        return ret, p_values

    return ret

compute_xi_correlation(x, y=None, get_modified_xi=None, m_nearest_neighbours=None, get_p_values=False)

Helper function to compute the Xi Coefficient - uses the class machinery from XiCorrelation.

Compute the Xi Coefficient (Chatterjee's Rank Correlation) between columns in X and Y.

Xi Coefficient based on

Chatterjee (2020). "A new coefficient of correlation"

Modified Xi Coefficient based on

Lin and Han (2021). "On boosting the power of Chatterjee's rank correlation"

The modified Xi Coefficient looks at M nearest neighbours to compute the correlation. This allows the coefficient to converge much faster. However, it is computationally slightly more intensive. For very large data, the two are likely to be very similar. We recommend using the modified Xi Coefficient.

If only X is passed, computes correlation between each column of X. If Y is also passed, computes correlation between each column of X vs each column of Y.

If only X is passed, X MUST be 2-d. Otherwise, both X and Y can be 1-d

Parameters:

Name Type Description Default
x npt.ArrayLike

A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

required
y npt.ArrayLike

A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.

None
get_modified_xi bool

Should the modified xi be computed? By default this is True when there are no ties and False when ties are present

None
m_nearest_neighbours int

Only used if get_modified_xi is True.

None
get_p_values bool

Should the p-values be computed? The null hypothesis is that Y is completely independent of X (i.e., xi = 0).

False

Returns:

Type Description
Union[_RetType, Tuple[_RetType, _RetType]]

float/np.ndarray/pd.DataFrame:

Union[_RetType, Tuple[_RetType, _RetType]]
  • Xi Coefficient Values.
  • If both X and Y are 1-d, returns a single float.
  • If X is numpy object, returns a 2-D numpy array.
  • Otherwise returns a pd.DataFrame.
Union[_RetType, Tuple[_RetType, _RetType]]
  • P-Values (only if get_p_values is true):
  • Same format as Xi
Source code in xicorpy/correlation.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def compute_xi_correlation(
    x: npt.ArrayLike,
    y: npt.ArrayLike = None,
    get_modified_xi: bool = None,
    m_nearest_neighbours: int = None,
    get_p_values: bool = False,
) -> Union[_RetType, Tuple[_RetType, _RetType]]:
    """
    Convenience wrapper: build a `XiCorrelation` from `x`/`y` and return its `compute_xi` result.

    Computes the Xi Coefficient (Chatterjee's Rank Correlation) between columns in X and Y.

    Xi Coefficient based on:
        [Chatterjee (2020). "A new coefficient of correlation"](https://arxiv.org/abs/1909.10140)


    Modified Xi Coefficient based on:
        [Lin and Han (2021). "On boosting the power of Chatterjee's rank correlation"](https://arxiv.org/abs/2108.06828)

    The modified Xi Coefficient looks at M nearest neighbours to compute the correlation.
    This allows the coefficient to converge much faster. However, it is computationally slightly more intensive.
    For very large data, the two are likely to be very similar. We recommend using the modified Xi Coefficient.

    When only X is given, each column of X is correlated with the other columns of X,
    and X MUST be 2-d. When Y is given as well, each column of X is correlated with
    each column of Y, and both may be 1-d.

    Args:
        x (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.
        y (npt.ArrayLike): A single list or list of lists or 1D/2D numpy array or pd.Series or pd.DataFrame.
        get_modified_xi: Should the modified xi be computed? By default this is True when there are no ties and False when ties are present
        m_nearest_neighbours: Only used if get_modified_xi is True.
        get_p_values: Should the p-values be computed?
                        The null hypothesis is that Y is completely independent of X (i.e., xi = 0).

    Returns:
        float/np.ndarray/pd.DataFrame:
        - Xi Coefficient Values, in whatever container `XiCorrelation.compute_xi` returns.
        - P-Values (only if get_p_values is true):
            - Same format as Xi


    """
    calculator = XiCorrelation(x, y)
    return calculator.compute_xi(
        get_modified_xi=get_modified_xi,
        m_nearest_neighbours=m_nearest_neighbours,
        get_p_values=get_p_values,
    )