LDRef

Prepare LD reference for easyfinemap.

Source code in easyfinemap/ldref.py

def __init__(self):
    """
    Initialize the LDRef class.

    Sets up the ``"LDRef"`` logger, reads the ``plink`` and ``gcta``
    attributes from ``Tools`` (they are later used as the first element of
    the external command lines) and makes sure the scratch directory
    ``<cwd>/tmp/ldref`` exists.
    """
    self.logger = logging.getLogger("LDRef")
    # One Tools instance is enough to read both attributes.
    tools = Tools()
    self.plink = tools.plink
    self.gcta = tools.gcta
    self.tmp_root = Path.cwd() / "tmp" / "ldref"
    # exist_ok=True removes the check-then-create race when several
    # LDRef objects (e.g. in parallel workers) are created at once.
    self.tmp_root.mkdir(parents=True, exist_ok=True)

`annotate_r2(sumstat, ldref, ld_snp, temp_dir=None)` ¶

Annotate SNPs with r2 to the lead SNP.

Parameters:

Name	Type	Description	Default
`sumstat`	`DataFrame`	The summary statistics.	required
`ldref`	`str`	The path to the LD reference file.	required
`ld_snp`	`str`	The lead SNP.	required
`temp_dir`	`Optional[str]`	The path to the temporary directory, by default None	`None`

Returns:

Type	Description
`DataFrame`	The annotated summary statistics.

Source code in easyfinemap/ldref.py

@io_in_tempdir('./tmp/ldref')
def annotate_r2(
    self,
    sumstat: pd.DataFrame,
    ldref: str,
    ld_snp: str,
    temp_dir: Optional[str] = None,
) -> pd.DataFrame:
    """
    Annotate SNPs with r2 to the lead SNP.

    The summary statistics are intersected with the LD reference, plink
    ``--r2 --ld-snp`` is run on the intersected genotypes, and the
    resulting r2 values are mapped back onto a copy of ``sumstat`` in a
    new ``R2`` column. The lead SNP itself gets ``R2 = 1``; SNPs without
    an r2 value get ``-1``.

    Parameters
    ----------
    sumstat : pd.DataFrame
        The summary statistics. Must contain a single chromosome.
    ldref : str
        The path to the LD reference file. ``{chrom}`` is filled in with
        the chromosome of ``sumstat`` via ``str.format``.
    ld_snp : str
        The lead SNP.
    temp_dir : Optional[str], optional
        The path to the temporary directory, by default None
        (presumably supplied by the ``io_in_tempdir`` decorator).

    Returns
    -------
    pd.DataFrame
        The annotated summary statistics.

    Raises
    ------
    ValueError
        If ``sumstat`` spans more than one chromosome, or if ``ld_snp``
        is not among the SNPs shared with the LD reference.
    RuntimeError
        If plink exits with a non-zero return code.
    """
    if len(sumstat[ColName.CHR].unique()) > 1:
        raise ValueError("Only one chromosome is allowed.")
    chrom = sumstat[ColName.CHR].iloc[0]
    if len(sumstat) > 100000:
        self.logger.warning(
            "The sumstats is large, it may take a long time to annotate the r2."
        )
    r2_df = sumstat.copy()
    r2_prefix = f"{temp_dir}/r2_input_{chrom}"
    # Reuse this instance instead of constructing a second LDRef.
    r2_input = self.intersect(sumstat, ldref.format(chrom=chrom), r2_prefix)
    # intersect() returns a column-less empty frame when plink fails; treat
    # that like "lead SNP not found" instead of failing with a KeyError.
    if r2_input.empty or ld_snp not in set(r2_input[ColName.SNPID]):
        raise ValueError(f"{ld_snp} not in the LD reference.")
    cmd = [
        self.plink,
        "--bfile",
        r2_prefix,
        "--r2",
        "--ld-snp",
        ld_snp,
        # Very wide windows and a zero r2 threshold so that every SNP in the
        # input is reported against the lead SNP.
        "--ld-window-kb",
        "100000",
        "--ld-window",
        "99999999",
        "--ld-window-r2",
        "0",
        "--keep-allele-order",
        "--out",
        f"{temp_dir}/r2_{chrom}",
    ]
    self.logger.debug(f"annotate r2: {' '.join(cmd)}")
    res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    if res.returncode != 0:
        self.logger.error(res.stderr)
        raise RuntimeError(res.stderr)

    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    res_r2 = pd.read_csv(f"{temp_dir}/r2_{chrom}.ld", sep=r"\s+")
    r2_by_snp = pd.Series(res_r2["R2"].values, index=res_r2["SNP_B"].values)
    r2_df["R2"] = r2_df[ColName.SNPID].map(r2_by_snp)
    r2_df.loc[r2_df[ColName.SNPID] == ld_snp, "R2"] = 1
    # -1 marks SNPs for which plink reported no r2.
    r2_df['R2'] = r2_df['R2'].fillna(-1)
    return r2_df

`cojo_cond(sumstats, cond_snps, ldref, sample_size, use_ref_EAF=False, temp_dir=None)` ¶

Conditional analysis. Update the beta, se, pval of the conditional SNPs.

Parameters:

Name	Type	Description	Default
`sumstats`	`DataFrame`	The summary statistics.	required
`cond_snps`	`DataFrame`	The conditional SNPs.	required
`ldref`	`str`	The path to the LD reference file.	required
`sample_size`	`int`	The sample size.	required
`use_ref_EAF`	`bool`	Whether to use the EAF in the LD reference file, by default False	`False`
`temp_dir`	`Optional[str]`	The path to the temporary directory, by default None	`None`

Raises:

Type	Description
`ValueError`	If the EAF is not in the sumstats and use_ref_EAF is False.

Returns:

Type	Description
`DataFrame`	The updated summary statistics.

Source code in easyfinemap/ldref.py

@io_in_tempdir('./tmp/ldref')
def cojo_cond(
    self,
    sumstats: pd.DataFrame,
    cond_snps: pd.DataFrame,
    ldref: str,
    sample_size: int,
    use_ref_EAF: bool = False,
    temp_dir: Optional[str] = None,
) -> pd.DataFrame:
    """
    Conditional analysis. Update the beta, se, pval of the conditional SNPs.

    Writes the summary statistics in GCTA ``.ma`` layout, runs
    ``gcta --cojo-cond`` on the given conditional SNPs and merges the
    conditional estimates back into ``sumstats``.

    If GCTA reports a collinearity problem among the conditional SNPs,
    the conditional SNPs are re-selected with ``--cojo-slct`` (collinearity
    cut-off 0.9, ``pJ < 5e-8``) and the conditional analysis is repeated
    with the re-selected SNPs.

    Parameters
    ----------
    sumstats : pd.DataFrame
        The summary statistics.
    cond_snps : pd.DataFrame
        The conditional SNPs.
    ldref : str
        The path to the LD reference file.
    sample_size : int
        The sample size.
    use_ref_EAF : bool, optional
        Whether to use the EAF in the LD reference file, by default False
    temp_dir : Optional[str], optional
        The path to the temporary directory, by default None
        (presumably supplied by the ``io_in_tempdir`` decorator).

    Raises
    ------
    ValueError
        If the EAF is not in the sumstats and use_ref_EAF is False.
    RuntimeError
        If the re-selection or the repeated conditional analysis fails.

    Returns
    -------
    pd.DataFrame
        The updated summary statistics (rows without conditional results
        are dropped). If GCTA produced no ``.cma.cojo`` file, ``sumstats``
        is returned unchanged.
    """
    if not use_ref_EAF and ColName.EAF not in sumstats.columns:
        raise ValueError(
            f"{ColName.EAF} is not in the sumstats, please set use_ref_EAF to True"
        )
    chrom = sumstats[ColName.CHR].iloc[0]
    # NOTE(review): the LD-reference intersection step is currently disabled,
    # so the EAF column is read from `sumstats` even when use_ref_EAF is True;
    # a missing EAF column then fails in the column selection below.
    ma_columns = {
        ColName.SNPID: "SNP",
        ColName.EA: "A1",
        ColName.NEA: "A2",
        ColName.EAF: "freq",
        ColName.BETA: "b",
        ColName.SE: "se",
        ColName.P: "p",
        ColName.N: "N",
    }
    cojo_input = sumstats.copy()
    cojo_input[ColName.N] = sample_size
    # Select and rename in one step; avoids an in-place rename on a slice.
    cojo_input = cojo_input[list(ma_columns)].rename(columns=ma_columns)

    cojo_p_file = f"{temp_dir}/cojo_input_{chrom}.ma"
    cond_snp_file = f"{temp_dir}/cojo_cond_{chrom}.snps"
    cojo_outfile = f"{temp_dir}/cojo_{chrom}.cond"
    cojo_input.to_csv(cojo_p_file, sep=" ", index=False)
    with open(cond_snp_file, "w") as f:
        f.write('\n'.join(cond_snps[ColName.SNPID].tolist()))

    def run_gcta(label, args):
        """Run gcta against the LD reference with the given extra arguments."""
        cmd = [self.gcta, "--bfile", ldref, *args]
        self.logger.debug(f"{label}: {' '.join(cmd)}")
        return run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)

    cond_args = [
        "--cojo-file",
        cojo_p_file,
        "--diff-freq",
        "1",
        "--cojo-collinear",
        "0.99",
        "--cojo-cond",
        cond_snp_file,
        "--out",
        cojo_outfile,
    ]

    res = run_gcta("conditional analysis", cond_args)
    if res.returncode != 0:
        except_error = 'Error: there is a collinearity problem of the given list of SNPs.'
        if except_error in res.stdout and os.path.exists(
            f"{temp_dir}/cojo_{chrom}.cond.given.cojo"
        ):
            self.logger.warning(
                'there is a collinearity problem of the given list of SNPs. Try slct again'
            )
            # Re-select among the conditional SNPs only, with a stricter
            # collinearity cut-off.
            reslct_ma = f"{temp_dir}/cojo_{chrom}.reslct.ma"
            cojo_input[cojo_input['SNP'].isin(cond_snps[ColName.SNPID])].to_csv(
                reslct_ma, sep=" ", index=False
            )
            res = run_gcta(
                "slct again",
                [
                    "--cojo-file",
                    reslct_ma,
                    "--diff-freq",
                    "1",
                    "--cojo-collinear",
                    "0.9",
                    "--cojo-slct",
                    "--out",
                    f"{temp_dir}/cojo_{chrom}.reslct",
                ],
            )
            # gcta reports its errors on stdout, hence stdout is logged.
            if res.returncode != 0:
                self.logger.error(res.stdout)
                raise RuntimeError(res.stdout)
            # sep=r"\s+" is the supported spelling of the deprecated
            # delim_whitespace=True.
            new_conds = pd.read_csv(
                f"{temp_dir}/cojo_{chrom}.reslct.jma.cojo", sep=r"\s+"
            )
            new_conds = new_conds[new_conds['pJ'] < 5e-8]['SNP'].values
            with open(cond_snp_file, "w") as f:
                f.write('\n'.join(new_conds))
            res = run_gcta("conditional analysis", cond_args)
            if res.returncode != 0:
                self.logger.error(res.stdout)
                raise RuntimeError(res.stdout)
        else:
            # Kept as best-effort (no exception), but no longer silent.
            self.logger.warning(
                f"gcta --cojo-cond exited with code {res.returncode}; "
                "conditional results may be missing"
            )
            self.logger.debug(res.stdout)

    if not os.path.exists(f"{cojo_outfile}.cma.cojo"):
        return sumstats

    cond_res = pd.read_csv(
        f"{cojo_outfile}.cma.cojo", sep="\t", usecols=["SNP", "bC", "bC_se", "pC"]
    )
    cond_res = cond_res.rename(
        columns={
            "SNP": ColName.SNPID,
            "bC": ColName.COJO_BETA,
            "bC_se": ColName.COJO_SE,
            "pC": ColName.COJO_P,
        }
    )
    output = sumstats.merge(cond_res, on=ColName.SNPID, how="left")
    # Drop SNPs for which gcta produced no conditional estimate.
    output = output.dropna(subset=[ColName.COJO_P, ColName.COJO_BETA, ColName.COJO_SE])
    return output

`extract(inprefix, outprefix, chrom, temp_dir=None, start=None, end=None, mac=10)` ¶

Extract the genotypes of given region from the LD reference.

Parameters:

Name	Type	Description	Default
`inprefix`	`str`	The input prefix.	required
`outprefix`	`str`	The output prefix.	required
`chrom`	`int`	The chromosome number.	required
`temp_dir`	`str`	The temporary directory.	`None`
`start`	`int`	The start position, by default None	`None`
`end`	`int`	The end position, by default None	`None`
`mac`	`int`	The minor allele count threshold, by default 10	`10`

Returns:

Type	Description
`None`

Source code in easyfinemap/ldref.py

@io_in_tempdir(dir="./tmp/ldref")
def extract(
    self,
    inprefix: str,
    outprefix: str,
    chrom: int,
    temp_dir: Optional[str] = None,
    start: Optional[int] = None,
    end: Optional[int] = None,
    mac: int = 10,
) -> None:
    """
    Extract the genotypes of given region from the LD reference.

    Without ``start`` the whole chromosome is extracted (``--chr``);
    otherwise a one-line range file is written to ``temp_dir`` and passed
    to plink via ``--extract range``. Variants are filtered with ``--mac``.

    Parameters
    ----------
    inprefix : str
        The input prefix. A literal ``{chrom}`` is replaced by ``chrom``.
    outprefix : str
        The output prefix.
    chrom : int
        The chromosome number.
    temp_dir : str
        The temporary directory (presumably supplied by the
        ``io_in_tempdir`` decorator).
    start : int, optional
        The start position, by default None
    end : int, optional
        The end position, by default None
    mac: int, optional
        The minor allele count threshold, by default 10

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``start`` is given without ``end``.
    FileNotFoundError
        If ``{inprefix}.bed`` does not exist.
    RuntimeError
        If plink exits with a non-zero return code.
    """
    # A half-specified region used to write the literal "None" into the
    # range file; reject it up front instead.
    if start is not None and end is None:
        raise ValueError("end must be given when start is given.")

    inprefix = inprefix.replace("{chrom}", str(chrom))
    # Validate the input before anything is written to disk.
    if not os.path.exists(f"{inprefix}.bed"):
        raise FileNotFoundError(f"{inprefix}.bed not found.")

    if start is None:
        extract_cmd = ["--chr", str(chrom)]
    else:
        region_file = f"{temp_dir}/{outprefix.split('/')[-1]}.region"
        with open(region_file, "w") as f:
            f.write(f"{chrom}\t{start}\t{end}\tregion")
        extract_cmd = ["--extract", "range", region_file]

    cmd = [
        self.plink,
        "--bfile",
        inprefix,
        *extract_cmd,
        "--keep-allele-order",
        "--mac",
        str(mac),
        "--make-bed",
        "--out",
        outprefix,
    ]
    # Log before running so the command is visible even if plink hangs.
    self.logger.debug(' '.join(cmd))
    self.logger.debug(f"extract chr{chrom}:{start}-{end} from {inprefix}")
    res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    if res.returncode != 0:
        self.logger.error(res.stderr)
        self.logger.error(f'see log file: {outprefix}.log for details')
        raise RuntimeError(res.stderr)

`intersect(sumstats, ldref, out_plink, use_ref_EAF=False, temp_dir=None)` ¶

Intersect the significant snps with the LD reference.

Parameters:

Name	Type	Description	Default
`sumstats`	`DataFrame`	The summary statistics.	required
`ldref`	`str`	The path to the LD reference file.	required
`out_plink`	`str`	The output prefix.	required
`use_ref_EAF`	`bool`	Use the EAF in the LD reference, by default False	`False`
`temp_dir`	`Optional[str]`	The path to the temporary directory, by default None	`None`

Returns:

Type	Description
`DataFrame`	The intersected significant snps.

Source code in easyfinemap/ldref.py

@io_in_tempdir(dir="./tmp/ldref")
def intersect(
    self,
    sumstats: pd.DataFrame,
    ldref: str,
    out_plink: str,
    use_ref_EAF: bool = False,
    temp_dir: Optional[str] = None,
) -> pd.DataFrame:
    """
    Intersect the significant snps with the LD reference.

    The SNP ids of ``sumstats`` are written to ``temp_dir`` and used with
    plink ``--extract`` to write the shared variants to ``out_plink``. The
    rows of ``sumstats`` whose SNP id appears in the resulting ``.bim`` are
    returned.

    Parameters
    ----------
    sumstats : pd.DataFrame
        The summary statistics.
    ldref : str
        The path to the LD reference file.
    out_plink : str
        The output prefix.
    use_ref_EAF : bool, optional
        Use the EAF in the LD reference, by default False. When True,
        plink ``--freq`` is run on ``out_plink`` and ``EAF`` / ``MAF``
        columns are filled from its output.
    temp_dir : Optional[str], optional
        The path to the temporary directory, by default None
        (presumably supplied by the ``io_in_tempdir`` decorator).

    Returns
    -------
    pd.DataFrame
        The intersected significant snps. An empty DataFrame is returned
        when the plink extraction fails.

    Raises
    ------
    FileNotFoundError
        If ``{ldref}.bim`` does not exist.
    """
    if not os.path.exists(f"{ldref}.bim"):
        raise FileNotFoundError(f"{ldref}.bim not found.")
    snp_list = f"{temp_dir}/overlap_snpid.txt"
    sumstats[ColName.SNPID].to_csv(snp_list, index=False, header=False)
    cmd = [
        self.plink,
        "--bfile",
        ldref,
        "--extract",
        snp_list,
        "--keep-allele-order",
        "--make-bed",
        "--out",
        out_plink,
    ]
    self.logger.debug(' '.join(cmd))
    self.logger.debug(f"intersect {sumstats.shape[0]} SNPs with {ldref}")
    res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    if res.returncode != 0:
        # Best effort: callers get an empty frame instead of an exception.
        self.logger.warning(res.stderr)
        self.logger.warning(f'see log file: {out_plink}.log for details')
        return pd.DataFrame()

    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    bim = pd.read_csv(
        f"{out_plink}.bim",
        sep=r"\s+",
        names=[ColName.CHR, ColName.RSID, "cM", ColName.BP, ColName.EA, ColName.NEA],
    )
    overlap_sumstat = sumstats[sumstats[ColName.SNPID].isin(bim[ColName.RSID])].copy()
    overlap_sumstat.reset_index(drop=True, inplace=True)

    if use_ref_EAF:
        cmd = [
            self.plink,
            "--bfile",
            out_plink,
            "--freq",
            "--out",
            f"{temp_dir}/freq",
        ]
        self.logger.debug(f"calculate EAF of {out_plink}")
        self.logger.debug(f"calculate EAF: {' '.join(cmd)}")
        res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        if res.returncode != 0:
            self.logger.warning(res.stderr)
            self.logger.warning(f'see log file: {temp_dir}/freq.log for details')
        freq = pd.read_csv(f"{temp_dir}/freq.frq", sep=r"\s+")
        # Match frequencies to the summary statistics by SNP id. The .frq rows
        # follow the order of the plink file, which need not be the row order
        # (or row count) of the summary statistics.
        freq = freq.drop_duplicates(subset="SNP").set_index("SNP")
        snp_ids = overlap_sumstat[ColName.SNPID]
        a1_frq = snp_ids.map(freq["MAF"])
        ref_a2 = snp_ids.map(freq["A2"])
        # The MAF column is the frequency of A1; flip it when the effect
        # allele of the summary statistics is the reference's A2.
        overlap_sumstat['EAF'] = (1 - a1_frq).where(ref_a2 == overlap_sumstat['EA'], a1_frq)
        overlap_sumstat['MAF'] = a1_frq
    return overlap_sumstat

`make_ld(ldref, outprefix, **kwargs)` ¶

Make the LD matrix.

TODO: Calculate the LD matrix using plink-pandas, because the `--ld` option of plink1.9 contains a bug.

Parameters:

Name	Type	Description	Default
`ldref`	`str`	The path to the LD reference file.	required
`outprefix`	`str`	The output prefix.	required

Raises:

Type	Description
`RuntimeError`	Documented, but not raised by the source shown below: a non-zero plink return code is only logged as a warning.

Returns:

Type	Description
`None`

Source code in easyfinemap/ldref.py

@io_in_tempdir('./tmp/ldref')
def make_ld(
    self,
    ldref: str,
    outprefix: str,
    **kwargs,
):
    """
    Make the LD matrix.

    Runs plink ``--r2 square spaces`` on ``ldref`` and then replaces every
    ``nan`` in ``{outprefix}.ld`` with ``1e-6``.

    TODO: Calculate LD matrix using plink-pandas, because plink1.9 --ld contains bug.

    Parameters
    ----------
    ldref : str
        The path to the LD reference file.
    outprefix : str
        The output prefix.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    None

    Notes
    -----
    A non-zero plink return code does not raise; it is logged as a warning
    and the method returns without touching the output file.
    """
    self.logger.info(f"Making LD matrix: {outprefix}")
    cmd = [
        self.plink,
        "--bfile",
        ldref,
        "--r2",
        "square",
        "spaces",
        "--threads",
        "1",
        "--out",
        outprefix,
    ]
    self.logger.debug(f"get LD matrix: {' '.join(cmd)}")
    res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    if res.returncode != 0:
        self.logger.warning(res.stderr)
        self.logger.warning(f'see log file: {outprefix}.log for details')
        return

    self.logger.debug("LD matrix is made")
    ld_file = f"{outprefix}.ld"
    if not os.path.exists(ld_file):
        self.logger.warning(f"{ld_file} not found, skip replacing nan")
        return
    # Same substitution as `sed -i s/nan/1e-6/g`, done in Python because
    # `sed -i` without a suffix is not portable to BSD/macOS sed. The file
    # is streamed line by line so a large matrix is never held in memory.
    patched_file = f"{ld_file}.tmp"
    with open(ld_file) as src, open(patched_file, "w") as dst:
        for line in src:
            dst.write(line.replace("nan", "1e-6"))
    os.replace(patched_file, ld_file)

`valid(ldref_path, outprefix, file_type='plink', mac=10, threads=1, temp_dir=None)` ¶

Validate the LD reference file.

TODO: 1. Convert VCF files to plink files. 2. Remove duplicated SNPs. 3. Remove SNPs with MAC < mac. 4. Make SNP names unique: chr-bp-sorted(EA,NEA). TODO: 5. Mark the bim file with an "#easyfinemap validated" flag in the first line.

Parameters:

Name	Type	Description	Default
`ldref_path`	`str`	The path to the LD reference file.	required
`outprefix`	`str`	The output prefix.	required
`file_type`	`str`	The file type of the LD reference file, by default "plink"	`'plink'`
`mac`	`int`	The minor allele count threshold, by default 10 SNPs with MAC < mac will be removed.	`10`
`threads`	`int`	The number of threads to use, by default 1	`1`
`temp_dir`	`Optional[str]`	The path to the temporary directory, by default None	`None`

Raises:

Type	Description
`ValueError`	If the file type is not supported.

Returns:

Type	Description
`None`

Source code in easyfinemap/ldref.py

@io_in_tempdir(dir='./tmp/ldref')
def valid(
    self,
    ldref_path: str,
    outprefix: str,
    file_type: str = "plink",
    mac: int = 10,
    threads: int = 1,
    temp_dir: Optional[str] = None,
) -> None:
    """
    Validate the LD reference file.

    For every chromosome in ``CHROMS`` a (source prefix, output prefix,
    mac) job is collected and handed to ``self._clean_per_chr`` through a
    worker pool. With a ``{chrom}`` placeholder in ``ldref_path`` the
    per-chromosome files are used directly (falling back to ``X`` for
    chromosome 23); otherwise each chromosome present in the single
    ``.bim`` file is first extracted into ``temp_dir``.

    TODO:1. format vcfs to plink files.
    2. remove duplicated snps.
    3. remove snps with MAC < mac.
    4. make SNP names unique, chr-bp-sorted(EA,NEA).
    TODO:5. mark bim file with "#easyfinemap validated" flag in the first line.

    Parameters
    ----------
    ldref_path : str
        The path to the LD reference file.
    outprefix : str
        The output prefix.
    file_type : str, optional
        The file type of the LD reference file, by default "plink"
    mac: int, optional
        The minor allele count threshold, by default 10
        SNPs with MAC < mac will be removed.
    threads : int, optional
        The number of threads to use, by default 1
    temp_dir : Optional[str], optional
        The path to the temporary directory, by default None
        (presumably supplied by the ``io_in_tempdir`` decorator).

    Raises
    ------
    ValueError
        If the file type is not supported.
    FileNotFoundError
        If ``ldref_path`` has no ``{chrom}`` placeholder and its ``.bed``
        or ``.bim`` file does not exist.

    Returns
    -------
    None
    """
    if file_type == "plink":
        self.file_type = file_type
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    # Three parallel lists: source prefixes, output prefixes, mac values.
    params: List[List[Union[str, int]]] = [[] for _ in range(3)]

    def schedule(src_prefix, chrom):
        """Queue one chromosome for cleaning."""
        params[0].append(src_prefix)
        params[1].append(f"{outprefix}.chr{chrom}")
        params[2].append(mac)

    per_chrom_files = "{chrom}" in ldref_path
    bim_chroms = set()
    if not per_chrom_files:
        if not os.path.exists(f"{ldref_path}.bed"):
            raise FileNotFoundError(f"{ldref_path}.bed not found.")
        if not os.path.exists(f"{ldref_path}.bim"):
            raise FileNotFoundError(f"{ldref_path}.bim not found.")
        # Read the chromosome column once in Python instead of running
        # grep through a shell for every chromosome; this also works for
        # paths containing spaces or shell metacharacters.
        with open(f"{ldref_path}.bim") as bim:
            for line in bim:
                fields = line.split(maxsplit=1)
                # Only count tokens that start the line, as `grep "^..."` did.
                if fields and line.startswith(fields[0]):
                    bim_chroms.add(fields[0])

    for chrom in CHROMS:
        if per_chrom_files:
            inprefix = ldref_path.replace("{chrom}", str(chrom))
            if os.path.exists(f"{inprefix}.bed"):
                schedule(inprefix, chrom)
                continue
            if chrom == 23:
                # Chromosome 23 may be stored under the name "X".
                inprefix = ldref_path.replace("{chrom}", "X")
                if os.path.exists(f"{inprefix}.bed"):
                    self.logger.warning(f"chr{chrom} not found, use X instead.")
                    schedule(inprefix, chrom)
                    continue
            self.logger.warning(f"{inprefix}.bed not found.")
        elif str(chrom) not in bim_chroms:
            self.logger.warning(f"Chrom {chrom} not found in {ldref_path}.bim")
        else:
            intermed_prefix = f"{temp_dir}/{outprefix.split('/')[-1]}.chr{chrom}"
            self.extract(ldref_path, intermed_prefix, chrom, mac=mac)
            schedule(intermed_prefix, chrom)

    # NOTE(review): passing several iterables positionally matches a
    # pathos-style Pool.map(f, *iterables); multiprocessing.Pool.map would
    # reject this call. Confirm which Pool this module imports.
    with Pool(threads) as p:
        p.map(self._clean_per_chr, *params)

LDRef

annotate_r2(sumstat, ldref, ld_snp, temp_dir=None) ¶

cojo_cond(sumstats, cond_snps, ldref, sample_size, use_ref_EAF=False, temp_dir=None) ¶

extract(inprefix, outprefix, chrom, temp_dir=None, start=None, end=None, mac=10) ¶

intersect(sumstats, ldref, out_plink, use_ref_EAF=False, temp_dir=None) ¶

make_ld(ldref, outprefix, **kwargs) ¶

valid(ldref_path, outprefix, file_type='plink', mac=10, threads=1, temp_dir=None) ¶

`annotate_r2(sumstat, ldref, ld_snp, temp_dir=None)` ¶

`cojo_cond(sumstats, cond_snps, ldref, sample_size, use_ref_EAF=False, temp_dir=None)` ¶

`extract(inprefix, outprefix, chrom, temp_dir=None, start=None, end=None, mac=10)` ¶

`intersect(sumstats, ldref, out_plink, use_ref_EAF=False, temp_dir=None)` ¶

`make_ld(ldref, outprefix, **kwargs)` ¶

`valid(ldref_path, outprefix, file_type='plink', mac=10, threads=1, temp_dir=None)` ¶