locus

Class for the input data of the fine-mapping analysis.

`Locus` ¶

Locus class to represent a genomic locus with associated summary statistics and linkage disequilibrium (LD) matrix.

Parameters¶

popu : str Population code. e.g. "EUR". Choose from ["AFR", "AMR", "EAS", "EUR", "SAS"]. cohort : str Cohort name. sample_size : int Sample size. sumstats : pd.DataFrame Summary statistics DataFrame. locus_start : int Fixed start position for the locus. locus_end : int Fixed end position for the locus. ld : Optional[LDMatrix], optional LD matrix, by default None. if_intersect : bool, optional Whether to intersect the LD matrix and summary statistics file, by default False.

Attributes¶

original_sumstats : pd.DataFrame The original summary statistics file. sumstats : pd.DataFrame The processed summary statistics file. ld : LDMatrix The LD matrix object. chrom : int Chromosome. start : int Start position of the locus. end : int End position of the locus. n_snps : int Number of SNPs in the locus. prefix : str The prefix combining population and cohort. locus_id : str Unique identifier for the locus. is_matched : bool Whether the LD matrix and summary statistics file are matched. lambda_s : Optional[float] The estimated lambda_s parameter from estimate_s_rss function, None if not calculated.

Notes¶

If no LD matrix is provided, only the ABF method can be used for fine-mapping.

Source code in credtools/locus.py

class Locus:
    """
    Locus class to represent a genomic locus with associated summary statistics and linkage disequilibrium (LD) matrix.

    Parameters
    ----------
    popu : str
        Population code. e.g. "EUR". Choose from ["AFR", "AMR", "EAS", "EUR", "SAS"].
    cohort : str
        Cohort name.
    sample_size : int
        Sample size.
    sumstats : pd.DataFrame
        Summary statistics DataFrame.
    locus_start : int
        Fixed start position for the locus.
    locus_end : int
        Fixed end position for the locus.
    ld : Optional[LDMatrix], optional
        LD matrix, by default None.
    if_intersect : bool, optional
        Whether to intersect the LD matrix and summary statistics file, by default False.

    Attributes
    ----------
    original_sumstats : pd.DataFrame
        The original summary statistics file.
    sumstats : pd.DataFrame
        The processed summary statistics file.
    ld : LDMatrix
        The LD matrix object.
    popu : str
        Population code.
    cohort : str
        Cohort name.
    sample_size : int
        Sample size.
    chrom : int
        Chromosome.
    start : int
        Start position of the locus.
    end : int
        End position of the locus.
    n_snps : int
        Number of SNPs in the locus.
    prefix : str
        The prefix combining population and cohort.
    locus_id : str
        Unique identifier for the locus.
    is_matched : bool
        Whether the LD matrix and summary statistics file are matched.
    lambda_s : Optional[float]
        The estimated lambda_s parameter from estimate_s_rss function, None if not calculated.

    Notes
    -----
    If no LD matrix is provided, only the ABF method can be used for fine-mapping.
    """

    def __init__(
        self,
        popu: str,
        cohort: str,
        sample_size: int,
        sumstats: pd.DataFrame,
        locus_start: int,
        locus_end: int,
        ld: Optional[LDMatrix] = None,
        if_intersect: bool = False,
    ) -> None:
        """
        Initialize the Locus object.

        Parameters
        ----------
        popu : str
            Population code. e.g. "EUR". Choose from ["AFR", "AMR", "EAS", "EUR", "SAS"].
        cohort : str
            Cohort name.
        sample_size : int
            Sample size.
        sumstats : pd.DataFrame
            Summary statistics DataFrame.
        locus_start : int
            Fixed start position for the locus.
        locus_end : int
            Fixed end position for the locus.
        ld : Optional[LDMatrix], optional
            LD matrix, by default None.
        if_intersect : bool, optional
            Whether to intersect the LD matrix and summary statistics file, by default False.

        Warnings
        --------
        If no LD matrix is provided, a warning is logged that only ABF method can be used.
        """
        self.sumstats = sumstats
        # Snapshot taken before any intersection can replace ``self.sumstats``.
        self._original_sumstats = self.sumstats.copy()
        self._popu = popu
        self._cohort = cohort
        self._sample_size = sample_size
        self._locus_start = locus_start
        self._locus_end = locus_end
        self.lambda_s = None
        # ``None`` is the documented "no LD" sentinel; compare by identity so the
        # decision never depends on how the LD object evaluates in a boolean context.
        if ld is not None:
            self.ld = ld
            if if_intersect:
                inters = intersect_sumstat_ld(self)
                self.sumstats = inters.sumstats
                self.ld = inters.ld
        else:
            logger.warning("LD matrix and map file not found. Can only run ABF method.")
            # Empty placeholder so ``self.ld`` is always an LDMatrix instance.
            self.ld = LDMatrix(pd.DataFrame(), np.array([]))

    @property
    def original_sumstats(self) -> pd.DataFrame:
        """Get the original sumstats file."""
        return self._original_sumstats

    @property
    def popu(self) -> str:
        """Get the population code."""
        return self._popu

    @property
    def cohort(self) -> str:
        """Get the cohort name."""
        return self._cohort

    @property
    def sample_size(self) -> int:
        """Get the sample size."""
        return self._sample_size

    @property
    def chrom(self) -> int:
        """Get the chromosome (taken from the first row of the sumstats)."""
        return self.sumstats[ColName.CHR].iloc[0]

    @property
    def start(self) -> int:
        """Get the start position."""
        return self._locus_start

    @property
    def end(self) -> int:
        """Get the end position."""
        return self._locus_end

    @property
    def n_snps(self) -> int:
        """Get the number of SNPs."""
        return len(self.sumstats)

    @property
    def prefix(self) -> str:
        """
        Get the prefix of the locus.

        For a single cohort this is ``{popu}_{cohort}``. When the cohort name
        contains ``+`` (meta-analysis), the cohort part is replaced by the number
        of cohorts plus the first 8 hex characters of the MD5 of the cohort string.
        """
        import hashlib

        # Check if this is a meta-analysis result (contains '+' in cohort)
        if "+" in self.cohort:
            # Create a hash of the cohort part
            cohort_hash = hashlib.md5(self.cohort.encode()).hexdigest()[:8]
            # Count number of cohorts for reference
            num_cohorts = len(self.cohort.split("+"))
            return f"{self.popu}_meta{num_cohorts}cohorts_{cohort_hash}"

        # For single cohort, use original format
        return f"{self.popu}_{self.cohort}"

    @property
    def locus_id(self) -> str:
        """Get the locus ID."""
        return f"{self.popu}_{self.cohort}_chr{self.chrom}:{self.start}-{self.end}"

    @property
    def is_matched(self) -> bool:
        """Check if the LD matrix and sumstats file are matched."""
        # check the order of SNPID in the LD matrix and the sumstats file are the exact same
        if self.ld is None:
            return False
        return self.ld.map[ColName.SNPID].equals(self.sumstats[ColName.SNPID])

    def __repr__(self) -> str:
        """
        Return a string representation of the Locus object.

        Returns
        -------
        str
            String representation of the Locus object.
        """
        return f"Locus(popu={self.popu}, cohort={self.cohort}, sample_size={self.sample_size}, chr={self.chrom}, start={self.start}, end={self.end}, sumstats={self.sumstats.shape}, ld={self.ld.r.shape})"

    def copy(self) -> "Locus":
        """
        Copy the Locus object.

        The copy gets its own copies of the processed sumstats, the LD object and
        the original sumstats snapshot; ``lambda_s`` is carried over.

        Returns
        -------
        Locus
            A copy of the Locus object.
        """
        new_locus = Locus(
            self.popu,
            self.cohort,
            self.sample_size,
            self.sumstats.copy(),
            self._locus_start,
            self._locus_end,
            self.ld.copy(),
            if_intersect=False,
        )
        # The constructor snapshots whatever sumstats it receives (here the
        # processed table), so restore the real original explicitly.
        new_locus._original_sumstats = self._original_sumstats.copy()
        new_locus.lambda_s = self.lambda_s
        return new_locus

`chrom` `property` ¶

Get the chromosome.

`cohort` `property` ¶

Get the cohort name.

`end` `property` ¶

Get the end position.

`is_matched` `property` ¶

Check if the LD matrix and sumstats file are matched.

`locus_id` `property` ¶

Get the locus ID.

`n_snps` `property` ¶

Get the number of SNPs.

`original_sumstats` `property` ¶

Get the original sumstats file.

`popu` `property` ¶

Get the population code.

`prefix` `property` ¶

Get the prefix of the locus.

`sample_size` `property` ¶

Get the sample size.

`start` `property` ¶

Get the start position.

`__init__(popu, cohort, sample_size, sumstats, locus_start, locus_end, ld=None, if_intersect=False)` ¶

Initialize the Locus object.

Parameters¶

popu : str Population code. e.g. "EUR". Choose from ["AFR", "AMR", "EAS", "EUR", "SAS"]. cohort : str Cohort name. sample_size : int Sample size. sumstats : pd.DataFrame Summary statistics DataFrame. locus_start : int Fixed start position for the locus. locus_end : int Fixed end position for the locus. ld : Optional[LDMatrix], optional LD matrix, by default None. if_intersect : bool, optional Whether to intersect the LD matrix and summary statistics file, by default False.

Warnings¶

If no LD matrix is provided, a warning is logged that only ABF method can be used.

Source code in credtools/locus.py

def __init__(
    self,
    popu: str,
    cohort: str,
    sample_size: int,
    sumstats: pd.DataFrame,
    locus_start: int,
    locus_end: int,
    ld: Optional[LDMatrix] = None,
    if_intersect: bool = False,
) -> None:
    """
    Initialize the Locus object.

    Parameters
    ----------
    popu : str
        Population code. e.g. "EUR". Choose from ["AFR", "AMR", "EAS", "EUR", "SAS"].
    cohort : str
        Cohort name.
    sample_size : int
        Sample size.
    sumstats : pd.DataFrame
        Summary statistics DataFrame.
    locus_start : int
        Fixed start position for the locus.
    locus_end : int
        Fixed end position for the locus.
    ld : Optional[LDMatrix], optional
        LD matrix, by default None.
    if_intersect : bool, optional
        Whether to intersect the LD matrix and summary statistics file, by default False.

    Warnings
    --------
    If no LD matrix is provided, a warning is logged that only ABF method can be used.
    """
    self.sumstats = sumstats
    # Snapshot taken before any intersection can replace ``self.sumstats``.
    self._original_sumstats = self.sumstats.copy()
    self._popu = popu
    self._cohort = cohort
    self._sample_size = sample_size
    self._locus_start = locus_start
    self._locus_end = locus_end
    self.lambda_s = None
    # ``None`` is the documented "no LD" sentinel; compare by identity so the
    # decision never depends on how the LD object evaluates in a boolean context.
    if ld is not None:
        self.ld = ld
        if if_intersect:
            inters = intersect_sumstat_ld(self)
            self.sumstats = inters.sumstats
            self.ld = inters.ld
    else:
        logger.warning("LD matrix and map file not found. Can only run ABF method.")
        # Empty placeholder so ``self.ld`` is always an LDMatrix instance.
        self.ld = LDMatrix(pd.DataFrame(), np.array([]))

`__repr__()` ¶

Return a string representation of the Locus object.

Returns¶

str String representation of the Locus object.

Source code in credtools/locus.py

def __repr__(self) -> str:
    """
    Build a one-line, human-readable summary of this Locus.

    Returns
    -------
    str
        ``Locus(...)`` listing population, cohort, sample size, chromosome,
        boundaries and the shapes of the sumstats table and LD matrix.
    """
    # Label/value pairs in display order; shapes are shown instead of the data.
    described = (
        ("popu", self.popu),
        ("cohort", self.cohort),
        ("sample_size", self.sample_size),
        ("chr", self.chrom),
        ("start", self.start),
        ("end", self.end),
        ("sumstats", self.sumstats.shape),
        ("ld", self.ld.r.shape),
    )
    rendered = ", ".join(f"{label}={value}" for label, value in described)
    return f"Locus({rendered})"

`copy()` ¶

Copy the Locus object.

Returns¶

Locus A copy of the Locus object.

Source code in credtools/locus.py

def copy(self) -> "Locus":
    """
    Copy the Locus object.

    The copy gets its own copies of the processed sumstats, the LD object and
    the original sumstats snapshot; ``lambda_s`` is carried over.

    Returns
    -------
    Locus
        A copy of the Locus object.
    """
    new_locus = Locus(
        self.popu,
        self.cohort,
        self.sample_size,
        self.sumstats.copy(),
        self._locus_start,
        self._locus_end,
        self.ld.copy(),
        if_intersect=False,
    )
    # The constructor snapshots whatever sumstats it receives (here the
    # processed table), so restore the real original explicitly.
    new_locus._original_sumstats = self._original_sumstats.copy()
    new_locus.lambda_s = self.lambda_s
    return new_locus

`LocusSet` ¶

LocusSet class to represent a set of genomic loci.

Parameters¶

loci : List[Locus] List of Locus objects.

Attributes¶

loci : List[Locus] List of Locus objects. n_loci : int Number of loci. chrom : int Chromosome number. start : int Start position of the locus. end : int End position of the locus. locus_id : str Unique identifier for the locus.

Raises¶

ValueError If the chromosomes of the loci are not the same.

Source code in credtools/locus.py

class LocusSet:
    """
    LocusSet class to represent a set of genomic loci.

    Parameters
    ----------
    loci : List[Locus]
        List of Locus objects.

    Attributes
    ----------
    loci : List[Locus]
        List of Locus objects.
    n_loci : int
        Number of loci.
    chrom : int
        Chromosome number.
    start : int
        Start position of the locus.
    end : int
        End position of the locus.
    locus_id : str
        Unique identifier for the locus.

    Raises
    ------
    ValueError
        When ``chrom``, ``start`` or ``end`` is read and the loci do not all
        share the same chromosome, start or end respectively.
    """

    def __init__(self, loci: List[Locus]) -> None:
        """
        Initialize the LocusSet object.

        Parameters
        ----------
        loci : List[Locus]
            List of Locus objects. The list is stored as-is (no copy is made).
        """
        self.loci = loci

    @property
    def n_loci(self) -> int:
        """Get the number of loci."""
        return len(self.loci)

    @property
    def chrom(self) -> int:
        """
        Get the chromosome.

        Returns
        -------
        int
            Chromosome number.

        Raises
        ------
        ValueError
            If the chromosomes of the loci are not the same.
        IndexError
            If the set contains no loci.
        """
        chrom_list = [locus.chrom for locus in self.loci]
        if len(set(chrom_list)) > 1:
            raise ValueError("The chromosomes of the loci are not the same.")
        return chrom_list[0]

    @property
    def start(self) -> int:
        """
        Get the start position.

        Raises
        ------
        ValueError
            If the loci do not all share the same start position.
        IndexError
            If the set contains no loci.
        """
        # All loci should have the same fixed start
        starts = [locus.start for locus in self.loci]
        if any(s != starts[0] for s in starts):
            raise ValueError("All loci in LocusSet must have the same start position")
        return starts[0]

    @property
    def end(self) -> int:
        """
        Get the end position.

        Raises
        ------
        ValueError
            If the loci do not all share the same end position.
        IndexError
            If the set contains no loci.
        """
        # All loci should have the same fixed end
        ends = [locus.end for locus in self.loci]
        if any(e != ends[0] for e in ends):
            raise ValueError("All loci in LocusSet must have the same end position")
        return ends[0]

    @property
    def locus_id(self) -> str:
        """Get the locus ID."""
        return f"{self.chrom}:{self.start}-{self.end}"

    def __repr__(self) -> str:
        """
        Return a string representation of the LocusSet object.

        Returns
        -------
        str
            String representation of the LocusSet object. An empty set is
            rendered with only ``n_loci=0``.
        """
        # An empty set has no chromosome or boundaries to report, and reading
        # them would raise IndexError, so render it without those fields.
        if not self.loci:
            return "LocusSet(\n n_loci=0\n)"
        header = f"LocusSet(\n n_loci={len(self.loci)}, chrom={self.chrom}, start={self.start}, end={self.end}, locus_id={self.locus_id} \n"
        members = "\n".join(repr(locus) for locus in self.loci)
        return f"{header}{members}\n)"

    def copy(self) -> "LocusSet":
        """
        Copy the LocusSet object.

        Returns
        -------
        LocusSet
            A new LocusSet built from a ``copy()`` of every contained locus.
        """
        return LocusSet([locus.copy() for locus in self.loci])

`chrom` `property` ¶

Get the chromosome.

Returns¶

int Chromosome number.

Raises¶

ValueError If the chromosomes of the loci are not the same.

`end` `property` ¶

Get the end position.

`locus_id` `property` ¶

Get the locus ID.

`n_loci` `property` ¶

Get the number of loci.

`start` `property` ¶

Get the start position.

`__init__(loci)` ¶

Initialize the LocusSet object.

Parameters¶

loci : List[Locus] List of Locus objects.

Source code in credtools/locus.py

def __init__(self, loci: List[Locus]) -> None:
    """
    Initialize the LocusSet object.

    Parameters
    ----------
    loci : List[Locus]
        List of Locus objects. The list is stored as-is (no copy is made).
    """
    self.loci = loci

`__repr__()` ¶

Return a string representation of the LocusSet object.

Returns¶

str String representation of the LocusSet object.

Source code in credtools/locus.py

def __repr__(self) -> str:
    """
    Return a string representation of the LocusSet object.

    Returns
    -------
    str
        String representation of the LocusSet object: a header line with the
        shared chromosome, boundaries and locus ID, followed by one line per
        contained locus. An empty set is rendered with only ``n_loci=0``.
    """
    # An empty set has no chromosome or boundaries to report, and reading them
    # would raise IndexError, so render it without those fields.
    if not self.loci:
        return "LocusSet(\n n_loci=0\n)"
    header = f"LocusSet(\n n_loci={len(self.loci)}, chrom={self.chrom}, start={self.start}, end={self.end}, locus_id={self.locus_id} \n"
    members = "\n".join(repr(locus) for locus in self.loci)
    return f"{header}{members}\n)"

`copy()` ¶

Copy the LocusSet object.

Returns¶

LocusSet A copy of the LocusSet object.

Source code in credtools/locus.py

def copy(self) -> "LocusSet":
    """
    Duplicate this LocusSet.

    Returns
    -------
    LocusSet
        A new LocusSet whose members are the ``copy()`` of each locus held here,
        in the same order.
    """
    duplicated = []
    for member in self.loci:
        duplicated.append(member.copy())
    return LocusSet(duplicated)

`check_loci_info(loci_info)` ¶

Check and validate loci information DataFrame.

Parameters¶

loci_info : pd.DataFrame DataFrame containing loci information.

Returns¶

pd.DataFrame Validated and type-corrected loci_info DataFrame.

Raises¶

ValueError If required columns are missing, data types are incorrect, or locus_id/boundary consistency checks fail.

Notes¶

This function performs the following checks:

1. Ensures all required columns are present
2. Validates and converts data types
3. Checks that loci with the same locus_id have the same chr, start, end
4. Validates chromosome, start, and end values

Source code in credtools/locus.py

def check_loci_info(loci_info: pd.DataFrame) -> pd.DataFrame:
    """
    Check and validate loci information DataFrame.

    The input is not modified; validation and type conversion run on a copy.

    Parameters
    ----------
    loci_info : pd.DataFrame
        DataFrame containing loci information.

    Returns
    -------
    pd.DataFrame
        Validated and type-corrected loci_info DataFrame.

    Raises
    ------
    ValueError
        If required columns are missing, data types are incorrect,
        or locus_id/boundary consistency checks fail. A conversion failure
        is chained to the underlying pandas/numpy error.

    Notes
    -----
    This function performs the following checks, in order:
    1. Ensures all required columns are present
    2. Converts ``sample_size``, ``chr``, ``start``, ``end`` to int and
       ``prefix``, ``popu``, ``cohort``, ``locus_id`` to str
    3. Validates sample size (> 0), chromosome (1-25), start (> 0) and
       end (> start)
    4. Checks that each popu+cohort+locus_id combination is unique
    5. Checks that loci with same locus_id have same chr, start, end
    """
    loci_info = loci_info.copy()

    # Check for required columns
    required_cols = [
        "prefix",
        "popu",
        "cohort",
        "sample_size",
        "chr",
        "start",
        "end",
        "locus_id",
    ]
    missing_cols = [col for col in required_cols if col not in loci_info.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Type checking and conversion
    int_cols = ["sample_size", "chr", "start", "end"]
    str_cols = ["prefix", "popu", "cohort", "locus_id"]
    try:
        for col in int_cols:
            loci_info[col] = loci_info[col].astype(int)
        for col in str_cols:
            loci_info[col] = loci_info[col].astype(str)
    except (ValueError, TypeError) as e:
        # Chain explicitly so the traceback shows the failing conversion as the cause.
        raise ValueError(f"Failed to convert data types: {e}") from e

    # Validate values
    if (loci_info["sample_size"] <= 0).any():
        raise ValueError("Sample size must be positive")

    if (loci_info["chr"] <= 0).any() or (loci_info["chr"] > 25).any():
        raise ValueError("Chromosome must be between 1 and 25")

    if (loci_info["start"] <= 0).any():
        raise ValueError("Start position must be positive")

    if (loci_info["end"] <= loci_info["start"]).any():
        raise ValueError("End position must be greater than start position")

    # Check for duplicates in popu+cohort+locus_id combination
    if loci_info.duplicated(subset=["popu", "cohort", "locus_id"]).any():
        raise ValueError("Each popu+cohort+locus_id combination must be unique")

    # Check consistency: same locus_id must have same chr, start, end
    locus_boundaries = loci_info.groupby("locus_id")[["chr", "start", "end"]].nunique()
    inconsistent_loci = locus_boundaries[(locus_boundaries > 1).any(axis=1)]

    if not inconsistent_loci.empty:
        raise ValueError(
            f"Inconsistent boundaries for locus_id(s): {inconsistent_loci.index.tolist()}. "
            "Each locus_id must have consistent chr, start, end values across all rows."
        )

    return loci_info

`intersect_loci(list_loci)` ¶

Intersect the Variant IDs in the LD matrices and the sumstats files of a list of Locus objects.

Parameters¶

list_loci : List[Locus] List of Locus objects.

Returns¶

List[Locus] List of Locus objects containing the intersected LD matrices and sumstats files.

Raises¶

NotImplementedError This function is not yet implemented.

Notes¶

This function is planned to intersect variant IDs across multiple loci to ensure consistent variant sets for multi-ancestry analysis.

Source code in credtools/locus.py

def intersect_loci(list_loci: List[Locus]) -> List[Locus]:
    """
    Placeholder for intersecting variant IDs across several Locus objects.

    Parameters
    ----------
    list_loci : List[Locus]
        List of Locus objects.

    Returns
    -------
    List[Locus]
        Intended to be the Locus objects restricted to their shared variants;
        nothing is returned at present.

    Raises
    ------
    NotImplementedError
        Always; this function is not yet implemented.

    Notes
    -----
    The planned behaviour is to intersect variant IDs across multiple loci
    so that multi-ancestry analysis runs on a consistent variant set.
    """
    todo_message = "Intersect the Variant IDs in the LD matrices and the sumstats files of a list of Locus objects."
    raise NotImplementedError(todo_message)

`intersect_sumstat_ld(locus)` ¶

Intersect the Variant IDs in the LD matrix and the sumstats file.

Parameters¶

locus : Locus Locus object containing LD matrix and summary statistics.

Returns¶

Locus Locus object containing the intersected LD matrix and sumstats file.

Raises¶

ValueError If LD matrix not found or no common Variant IDs found between the LD matrix and the sumstats file.

Warnings¶

If only a few common Variant IDs are found (≤ 10), a warning is logged.

Notes¶

This function performs the following operations:

1. Checks if LD matrix and summary statistics are already matched
2. Finds common SNP IDs between LD matrix and summary statistics
3. Subsets both datasets to common variants
4. Reorders data to maintain consistency
5. Returns a new Locus object with intersected data

Source code in credtools/locus.py

def intersect_sumstat_ld(locus: Locus) -> Locus:
    """
    Intersect the Variant IDs in the LD matrix and the sumstats file.

    Parameters
    ----------
    locus : Locus
        Locus object containing LD matrix and summary statistics.

    Returns
    -------
    Locus
        Locus object containing the intersected LD matrix and sumstats file.
        If the input is already matched, the same ``locus`` object is returned
        unchanged (not a copy).

    Raises
    ------
    ValueError
        If LD matrix not found or no common Variant IDs found between the LD matrix and the sumstats file.

    Warnings
    --------
    If only a few common Variant IDs are found (≤ 10), a warning is logged.

    Notes
    -----
    This function performs the following operations:

    1. Checks if LD matrix and summary statistics are already matched
    2. Sorts the summary statistics by chromosome and position
    3. Keeps the summary-statistics rows whose SNP ID is in the LD map
    4. Reorders the LD map and LD matrix to follow that sumstats order
    5. Returns a new Locus object with intersected data
    """
    if locus.ld is None:
        raise ValueError("LD matrix not found.")
    if locus.is_matched:
        logger.info("The LD matrix and sumstats file are matched.")
        return locus

    ldmap = locus.ld.map.copy()
    # sort_values returns a new frame, so the locus' own table is left untouched.
    sumstats = locus.sumstats.sort_values(
        [ColName.CHR, ColName.BP], ignore_index=True
    )
    in_ld = sumstats[ColName.SNPID].isin(ldmap[ColName.SNPID])
    intersec_sumstats = sumstats[in_ld].copy()
    intersec_variants = intersec_sumstats[ColName.SNPID].to_numpy()
    if len(intersec_variants) == 0:
        raise ValueError(
            f"No common Variant IDs found between the LD matrix and the sumstats file for locus {locus.locus_id}."
        )
    elif len(intersec_variants) <= 10:
        logger.warning(
            f"Only a few common Variant IDs found between the LD matrix and the sumstats file(<= 10) for locus {locus.locus_id}."
        )

    # Record each map row's *position*: these values index rows/columns of r,
    # so they must not depend on whatever labels the map's index carries.
    ldmap["idx"] = range(len(ldmap))
    ldmap.set_index(ColName.SNPID, inplace=True, drop=False)
    # Reorder the map to follow the sorted sumstats order.
    ldmap = ldmap.loc[intersec_variants].copy()
    intersec_index = ldmap["idx"].to_numpy()
    # NOTE(review): assumes r is a NumPy array, for which integer-array indexing
    # already yields a new array (so no defensive copy of the full matrix).
    r = locus.ld.r[intersec_index, :][:, intersec_index]
    intersec_sumstats.reset_index(drop=True, inplace=True)
    ldmap.drop("idx", axis=1, inplace=True)
    ldmap = ldmap.reset_index(drop=True)
    intersec_ld = LDMatrix(ldmap, r)
    logger.info(
        "Intersected the Variant IDs in the LD matrix and the sumstats file. "
        f"Number of common Variant IDs: {len(intersec_index)}"
    )
    return Locus(
        locus.popu,
        locus.cohort,
        locus.sample_size,
        intersec_sumstats,
        locus._locus_start,
        locus._locus_end,
        intersec_ld,
        if_intersect=False,
    )

`load_locus(prefix, popu, cohort, sample_size, locus_start, locus_end, if_intersect=False, calculate_lambda_s=False, **kwargs)` ¶

Load the input data of the fine-mapping analysis.

Parameters¶

prefix : str Prefix of the input files. popu : str Population of the input data. cohort : str Cohort of the input data. sample_size : int Sample size of the input data. locus_start : int Fixed start position for the locus. locus_end : int Fixed end position for the locus. if_intersect : bool, optional Whether to intersect the input data with the LD matrix, by default False. calculate_lambda_s : bool, optional Whether to calculate lambda_s parameter using estimate_s_rss function, by default False. **kwargs : Any Additional keyword arguments passed to loading functions.

Returns¶

Locus Locus object containing the input data.

Raises¶

ValueError If the required input files are not found.

Notes¶

The function looks for files with the following patterns:

Summary statistics: {prefix}.sumstat or {prefix}.sumstats.gz
LD matrix: {prefix}.ld or {prefix}.ld.npz
LD map: {prefix}.ldmap or {prefix}.ldmap.gz

All files are required for proper functioning.

Examples¶

>>> locus = load_locus('EUR_study1', 'EUR', 'study1', 50000, 1000000, 2000000)
>>> print(f"Loaded locus with {locus.n_snps} SNPs")
Loaded locus with 10000 SNPs

Source code in credtools/locus.py

def _first_existing_path(candidates: List[str]) -> Optional[str]:
    """Return the first path in ``candidates`` that exists on disk, or None."""
    for path in candidates:
        if os.path.exists(path):
            return path
    return None


def load_locus(
    prefix: str,
    popu: str,
    cohort: str,
    sample_size: int,
    locus_start: int,
    locus_end: int,
    if_intersect: bool = False,
    calculate_lambda_s: bool = False,
    **kwargs: Any,
) -> Locus:
    """
    Load the input data of the fine-mapping analysis.

    Parameters
    ----------
    prefix : str
        Prefix of the input files.
    popu : str
        Population of the input data.
    cohort : str
        Cohort of the input data.
    sample_size : int
        Sample size of the input data.
    locus_start : int
        Fixed start position for the locus.
    locus_end : int
        Fixed end position for the locus.
    if_intersect : bool, optional
        Whether to intersect the input data with the LD matrix, by default False.
    calculate_lambda_s : bool, optional
        Whether to calculate lambda_s parameter using estimate_s_rss function, by default False.
        A failure during the calculation is logged as a warning and leaves
        ``lambda_s`` as None instead of raising.
    **kwargs : Any
        Additional keyword arguments passed to both ``load_sumstats`` and ``load_ld``.

    Returns
    -------
    Locus
        Locus object containing the input data.

    Raises
    ------
    ValueError
        If the summary statistics, LD matrix or LD map file is not found.

    Notes
    -----
    The function looks for files with the following patterns (the first
    existing candidate wins):

    - Summary statistics: {prefix}.sumstat or {prefix}.sumstats.gz
    - LD matrix: {prefix}.ld or {prefix}.ld.npz
    - LD map: {prefix}.ldmap or {prefix}.ldmap.gz

    All files are required for proper functioning.

    Examples
    --------
    >>> locus = load_locus(
    ...     'EUR_study1', 'EUR', 'study1', 50000, 1000000, 2000000
    ... )  # doctest: +SKIP
    >>> print(f"Loaded locus with {locus.n_snps} SNPs")  # doctest: +SKIP
    Loaded locus with 10000 SNPs
    """
    sumstats_path = _first_existing_path(
        [f"{prefix}.sumstat", f"{prefix}.sumstats.gz"]
    )
    if sumstats_path is None:
        raise ValueError("Sumstats file not found.")
    sumstats = load_sumstats(sumstats_path, if_sort_alleles=True, **kwargs)

    ld_path = _first_existing_path([f"{prefix}.ld", f"{prefix}.ld.npz"])
    if ld_path is None:
        raise ValueError("LD matrix file not found.")
    ldmap_path = _first_existing_path([f"{prefix}.ldmap", f"{prefix}.ldmap.gz"])
    if ldmap_path is None:
        raise ValueError("LD map file not found.")
    ld = load_ld(ld_path, ldmap_path, if_sort_alleles=True, **kwargs)

    locus = Locus(
        popu,
        cohort,
        sample_size,
        sumstats,
        locus_start,
        locus_end,
        ld=ld,
        if_intersect=if_intersect,
    )

    if calculate_lambda_s:
        # Best-effort: a failed estimate must not prevent the locus from loading.
        try:
            # Import here to avoid circular imports
            from credtools.qc import estimate_s_rss

            locus.lambda_s = estimate_s_rss(locus)
            logger.info(
                f"Calculated lambda_s for locus {locus.locus_id}: {locus.lambda_s}"
            )
        except Exception as e:
            logger.warning(
                f"Failed to calculate lambda_s for locus {locus.locus_id}: {e}"
            )
            locus.lambda_s = None

    return locus

`load_locus_set(locus_info, if_intersect=False, calculate_lambda_s=False, **kwargs)` ¶

Load the input data of the fine-mapping analysis for multiple loci.

Parameters¶

locus_info : pd.DataFrame DataFrame containing the locus information with required columns: ['prefix', 'popu', 'cohort', 'sample_size', 'chr', 'start', 'end', 'locus_id']. if_intersect : bool, optional Whether to intersect the input data with the LD matrix, by default False. calculate_lambda_s : bool, optional Whether to calculate lambda_s parameter using estimate_s_rss function, by default False. **kwargs : Any Additional keyword arguments passed to load_locus function.

Returns¶

LocusSet LocusSet object containing the input data.

Raises¶

ValueError If required columns are missing or if the combination of popu, cohort and locus_id is not unique.

Notes¶

The locus_info DataFrame must contain the following columns:

prefix: File prefix for each locus
popu: Population code
cohort: Cohort name
sample_size: Sample size for the cohort
chr: Chromosome number
start: Start position of the locus
end: End position of the locus
locus_id: Locus identifier

All rows must have the same chr, start, end, locus_id values (representing the same locus).

Examples¶

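>>> locus_info = pd.DataFrame({
...     'prefix': ['EUR_study1', 'EAS_study2'],
...     'popu': ['EUR', 'EAS'],
...     'cohort': ['study1', 'study2'],
...     'sample_size': [50000, 30000],
...     'chr': [1, 1],
...     'start': [1000000, 1000000],
...     'end': [2000000, 2000000],
...     'locus_id': ['chr1_1000000_2000000', 'chr1_1000000_2000000']
... })
>>> locus_set = load_locus_set(locus_info)
>>> print(f"Loaded {locus_set.n_loci} loci")
Loaded 2 loci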

Source code in credtools/locus.py

def load_locus_set(
    locus_info: pd.DataFrame,
    if_intersect: bool = False,
    calculate_lambda_s: bool = False,
    **kwargs: Any,
) -> LocusSet:
    """
    Load the input data of the fine-mapping analysis for multiple loci.

    Each row of ``locus_info`` describes one population/cohort study of the
    same genomic locus. Every row is loaded with ``load_locus`` and the
    resulting objects are wrapped in a single ``LocusSet``.

    Parameters
    ----------
    locus_info : pd.DataFrame
        DataFrame containing the locus information with required columns:
        ['prefix', 'popu', 'cohort', 'sample_size', 'chr', 'start', 'end', 'locus_id'].
    if_intersect : bool, optional
        Whether to intersect the input data with the LD matrix, by default False.
    calculate_lambda_s : bool, optional
        Whether to calculate lambda_s parameter using estimate_s_rss function, by default False.
    **kwargs : Any
        Additional keyword arguments passed to load_locus function.

    Returns
    -------
    LocusSet
        LocusSet object containing the input data.

    Raises
    ------
    ValueError
        If required columns are missing, if the rows do not all share the same
        chr, start, end and locus_id values, or if the combination of popu and
        cohort is not unique.

    Notes
    -----
    The locus_info DataFrame must contain the following columns:

    - prefix: File prefix for each locus
    - popu: Population code
    - cohort: Cohort name
    - sample_size: Sample size for the cohort
    - chr: Chromosome number
    - start: Start position of the locus
    - end: End position of the locus
    - locus_id: Locus identifier

    All rows must have the same chr, start, end, locus_id values (representing the same locus).

    Examples
    --------
    >>> locus_info = pd.DataFrame({
    ...     'prefix': ['EUR_study1', 'EAS_study2'],
    ...     'popu': ['EUR', 'EAS'],
    ...     'cohort': ['study1', 'study2'],
    ...     'sample_size': [50000, 30000],
    ...     'chr': [1, 1],
    ...     'start': [1000000, 1000000],
    ...     'end': [2000000, 2000000],
    ...     'locus_id': ['chr1_1000000_2000000', 'chr1_1000000_2000000']
    ... })
    >>> locus_set = load_locus_set(locus_info)
    >>> print(f"Loaded {locus_set.n_loci} loci")
    Loaded 2 loci
    """
    # Check and validate the locus_info DataFrame
    locus_info = check_loci_info(locus_info)

    # Every row must describe the same locus. The columns are checked in a
    # fixed order so the first mismatch reported is stable.
    shared_columns = (
        ("chr", "chromosome"),
        ("start", "start position"),
        ("end", "end position"),
        ("locus_id", "locus_id"),
    )
    for column, label in shared_columns:
        if len(locus_info[column].unique()) > 1:
            raise ValueError(f"All rows must have the same {label}")

    # Additional check for load_locus_set: popu+cohort must be unique within this single locus
    if locus_info.duplicated(subset=["popu", "cohort"]).any():
        raise ValueError(
            "Each popu+cohort combination must be unique within a single locus"
        )

    # One Locus per row; the row index itself is not needed.
    loci = [
        load_locus(
            row["prefix"],
            row["popu"],
            row["cohort"],
            row["sample_size"],
            int(row["start"]),
            int(row["end"]),
            if_intersect,
            calculate_lambda_s,
            **kwargs,
        )
        for _, row in locus_info.iterrows()
    ]
    return LocusSet(loci)

locus

Locus ¶

Parameters¶

Attributes¶

Notes¶

chrom property ¶

cohort property ¶

end property ¶

is_matched property ¶

locus_id property ¶

n_snps property ¶

original_sumstats property ¶

popu property ¶

prefix property ¶

sample_size property ¶

start property ¶

__init__(popu, cohort, sample_size, sumstats, locus_start, locus_end, ld=None, if_intersect=False) ¶

Parameters¶

Warnings¶

__repr__() ¶

Returns¶

copy() ¶

Returns¶

LocusSet ¶

Parameters¶

Attributes¶

Raises¶

chrom property ¶

Returns¶

Raises¶

end property ¶

locus_id property ¶

n_loci property ¶

start property ¶

__init__(loci) ¶

Parameters¶

__repr__() ¶

Returns¶

copy() ¶

Returns¶

check_loci_info(loci_info) ¶

Parameters¶

Returns¶

Raises¶

Notes¶

intersect_loci(list_loci) ¶

Parameters¶

Returns¶

Raises¶

Notes¶

intersect_sumstat_ld(locus) ¶

Parameters¶

Returns¶

Raises¶

Warnings¶

Notes¶

load_locus(prefix, popu, cohort, sample_size, locus_start, locus_end, if_intersect=False, calculate_lambda_s=False, **kwargs) ¶

Parameters¶

Returns¶

Raises¶

Notes¶

Examples¶

load_locus_set(locus_info, if_intersect=False, calculate_lambda_s=False, **kwargs) ¶

Parameters¶

Returns¶

Raises¶

Notes¶

Examples¶

`Locus` ¶

`chrom` `property` ¶

`cohort` `property` ¶

`end` `property` ¶

`is_matched` `property` ¶

`locus_id` `property` ¶

`n_snps` `property` ¶

`original_sumstats` `property` ¶

`popu` `property` ¶

`prefix` `property` ¶

`sample_size` `property` ¶

`start` `property` ¶

`__init__(popu, cohort, sample_size, sumstats, locus_start, locus_end, ld=None, if_intersect=False)` ¶

`__repr__()` ¶

`copy()` ¶

`LocusSet` ¶

`chrom` `property` ¶

`end` `property` ¶

`locus_id` `property` ¶

`n_loci` `property` ¶

`start` `property` ¶

`__init__(loci)` ¶

`__repr__()` ¶

`copy()` ¶

`check_loci_info(loci_info)` ¶

`intersect_loci(list_loci)` ¶

`intersect_sumstat_ld(locus)` ¶

`load_locus(prefix, popu, cohort, sample_size, locus_start, locus_end, if_intersect=False, calculate_lambda_s=False, **kwargs)` ¶

`load_locus_set(locus_info, if_intersect=False, calculate_lambda_s=False, **kwargs)` ¶