Skip to content

utils

Utils for easyfinemap.

get_significant_snps(df, pvalue_threshold=5e-08, use_most_sig_if_no_sig=True)

Get the significant SNPs from the input summary statistics, filtered by p-value.

Parameters:

Name Type Description Default
df DataFrame

The input summary statistics.

required
pvalue_threshold float

The pvalue threshold, by default 5e-8

5e-08
use_most_sig_if_no_sig bool

Whether to use the most significant SNP if no significant SNP found, by default True

True

Returns:

Type Description
DataFrame

The significant snps, sorted by pvalue.

Source code in easyfinemap/utils.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def get_significant_snps(
    df: pd.DataFrame, pvalue_threshold: float = 5e-8, use_most_sig_if_no_sig: bool = True
) -> pd.DataFrame:
    """
    Get the significant snps from the input summary statistics, filter by pvalue.

    Parameters
    ----------
    df : pd.DataFrame
        The input summary statistics.
    pvalue_threshold : float, optional
        The pvalue threshold, by default 5e-8
    use_most_sig_if_no_sig : bool, optional
        Whether to use the most significant SNP if no significant SNP found, by default True

    Returns
    -------
    pd.DataFrame
        The significant snps, sorted by pvalue, with a fresh 0..n-1 index.
        When no SNP passes the threshold and ``use_most_sig_if_no_sig`` is True,
        the row(s) holding the minimum pvalue are returned instead.

    Raises
    ------
    ValueError
        If no SNP passes the threshold and ``use_most_sig_if_no_sig`` is False,
        or if the input holds no usable pvalue at all (empty or all-NaN).
    """
    pvalues = df[ColName.P]
    sig_df = df.loc[pvalues < pvalue_threshold].copy()
    if sig_df.empty:
        if not use_most_sig_if_no_sig:
            raise ValueError("No significant SNPs found.")
        # Fall back to the row(s) with the smallest pvalue (ties are all kept).
        sig_df = df.loc[pvalues == pvalues.min()].copy()
        if sig_df.empty:
            # Empty input or all-NaN pvalues: min() is NaN and matches nothing.
            # Fail with a clear message instead of an IndexError on .values[0].
            raise ValueError("No SNPs with a valid pvalue found in the input.")
        logging.debug("Use the most significant SNP: %s", sig_df[ColName.SNPID].values[0])
        logging.debug("pvalue: %s", sig_df[ColName.P].values[0])
    # Normalise the result the same way for both branches so callers can rely
    # on positional indexing regardless of which path produced the rows.
    sig_df.sort_values(ColName.P, inplace=True)
    sig_df.reset_index(drop=True, inplace=True)
    return sig_df

io_in_tempdir(dir='./tmp')

Create a temporary working directory for the decorated function.

Parameters:

Name Type Description Default
dir str

The tempdir, by default './tmp'

'./tmp'

Returns:

Type Description
decorator

A decorator that passes a temporary directory to the wrapped function.

Source code in easyfinemap/utils.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def io_in_tempdir(dir='./tmp'):
    """
    Run the decorated function with a fresh temporary directory.

    For every call a new directory is created under ``dir`` and its path is
    passed to the wrapped function as the ``temp_dir`` keyword argument.
    After the call finishes (whether it returned or raised) the directory is
    removed, unless the root logger's effective level is below INFO
    (e.g. DEBUG), in which case it is kept for inspection.

    Parameters
    ----------
    dir : str, optional
        The parent directory in which the tempdir is created, by default
        './tmp'. It is created if it does not exist yet.

    Returns
    -------
    decorator
        The decorator of io in tempdir.
    """
    import os  # local import: only needed to create the parent directory

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # mkdtemp does not create missing parents; do it here so the
            # default './tmp' works on a clean checkout.
            if dir is not None:
                os.makedirs(dir, exist_ok=True)
            temp_dir = tempfile.mkdtemp(dir=dir)
            logger = logging.getLogger("IO")
            logger.debug(f"Tempdir: {temp_dir}")
            try:
                return func(*args, temp_dir=temp_dir, **kwargs)
            finally:
                # Clean up on success *and* failure so failed runs do not leak
                # directories; keep them only when debugging.
                if logging.getLogger().getEffectiveLevel() >= logging.INFO:
                    # Best-effort removal: never mask the wrapped function's
                    # own exception with a cleanup error.
                    shutil.rmtree(temp_dir, ignore_errors=True)

        return wrapper

    return decorator

make_SNPID_unique(sumstat, replace_rsIDcol=False, remove_duplicates=True)

Make the SNPID unique.

The unique SNPID is chr-bp-sorted(EA,NEA)

Parameters:

Name Type Description Default
sumstat DataFrame

The input summary statistics.

required
replace_rsIDcol bool

Whether to replace the rsID column with the unique SNPID, by default False

False
remove_duplicates bool

Whether to remove the duplicated SNPs, keep the one with smallest P-value, by default True

True

Returns:

Type Description
DataFrame

The summary statistics with unique SNPID.

Source code in easyfinemap/utils.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def make_SNPID_unique(
    sumstat: pd.DataFrame, replace_rsIDcol: bool = False, remove_duplicates: bool = True
) -> pd.DataFrame:
    """
    Make the SNPID unique.

    The unique SNPID is chr-bp-sorted(EA,NEA)

    Parameters
    ----------
    sumstat : pd.DataFrame
        The input summary statistics. It is not modified; a copy is returned.
    replace_rsIDcol : bool, optional
        Whether to replace the rsID column with the unique SNPID, by default False.
        When False, the unique SNPID is inserted as the first column (replacing
        any existing SNPID column).
    remove_duplicates : bool, optional
        Whether to remove the duplicated SNPs, keep the one with smallest P-value, by default True

    Returns
    -------
    pd.DataFrame
        The summary statistics with unique SNPID. When ``remove_duplicates`` is
        True the rows are sorted by chromosome and position and re-indexed.
    """
    df = sumstat.copy()
    allele_df = df[[ColName.EA, ColName.NEA]].copy()
    # Sort each (EA, NEA) pair on an explicit writable copy. ``.values`` may
    # return a view of the frame, which is read-only under pandas
    # Copy-on-Write and would make an in-place sort raise.
    sorted_alleles = allele_df.to_numpy(copy=True)
    sorted_alleles.sort(axis=1)
    allele_df[[ColName.EA, ColName.NEA]] = sorted_alleles
    allele_df[ColName.SNPID] = (
        df[ColName.CHR].astype(str)
        + "-"
        + df[ColName.BP].astype(str)
        + "-"
        + allele_df[ColName.EA]
        + "-"
        + allele_df[ColName.NEA]
    )
    if replace_rsIDcol:
        id_col = ColName.RSID
        df[id_col] = allele_df[ColName.SNPID]
    else:
        id_col = ColName.SNPID
        if id_col in df.columns:
            df.drop(id_col, axis=1, inplace=True)
        df.insert(loc=0, column=id_col, value=allele_df[ColName.SNPID].values)  # type: ignore
    if remove_duplicates:
        # Sorting by P first makes keep="first" retain the most significant
        # record of each duplicated identifier.
        df.sort_values(ColName.P, inplace=True)
        df.drop_duplicates(subset=[id_col], keep="first", inplace=True)
        df.sort_values([ColName.CHR, ColName.BP], inplace=True)
        df.reset_index(drop=True, inplace=True)
    return df