Manhattan & QQ plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
from scipy.stats.mstats import mquantiles
from scipy.stats import beta
from scipy.stats import linregress
sum_stat = '/f/jianhua/nankai-hic/fine-mapping/prostate_cancer.txt'
chrom, bp, p = 'CHR','BP','P'
sep = '\t'
df = pd.read_csv(sum_stat,sep=sep)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-2-949a920fa523> in <module>
      2 chrom, bp, p = 'CHR','BP','P'
      3 sep = '\t'
----> 4 df = pd.read_csv(sum_stat,sep=sep)

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    608     kwds.update(kwds_defaults)
    609 
--> 610     return _read(filepath_or_buffer, kwds)
    611 
    612 

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    460 
    461     # Create the parser.
--> 462     parser = TextFileReader(filepath_or_buffer, **kwds)
    463 
    464     if chunksize or iterator:

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    817             self.options["has_index_names"] = kwds["has_index_names"]
    818 
--> 819         self._engine = self._make_engine(self.engine)
    820 
    821     def close(self):

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1048             )
   1049         # error: Too many arguments for "ParserBase"
-> 1050         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1051 
   1052     def _failover_to_python(self):

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1865 
   1866         # open handles
-> 1867         self._open_handles(src, kwds)
   1868         assert self.handles is not None
   1869         for key in ("storage_options", "encoding", "memory_map", "compression"):

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/parsers.py in _open_handles(self, src, kwds)
   1360         Let the readers open IOHanldes after they are done with their potential raises.
   1361         """
-> 1362         self.handles = get_handle(
   1363             src,
   1364             "r",

~/anaconda3/envs/jb-book/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    640                 errors = "replace"
    641             # Encoding
--> 642             handle = open(
    643                 handle,
    644                 ioargs.mode,

FileNotFoundError: [Errno 2] No such file or directory: '/f/jianhua/nankai-hic/fine-mapping/prostate_cancer.txt'

Reduce size

df['CHR'] = df['CHR'].replace('X',23)
df['CHR'] = df['CHR'].astype(int)
df['P'] = -np.log10(df['P'])
df['P_down'] = df['P'].round(1)
df['P'] = df['P'].round(3)
df['BP_down'] = df['BP']/5e6
df['BP_down'] = df['BP_down'].astype(int)
idx = np.random.permutation(np.arange(len(df)))
df = df.iloc[idx].drop_duplicates(['CHR','P_down','BP_down'])
df = df.sort_values(['CHR','BP'])
df['P'] = 10 ** -df['P']
def qq(data,ax,color):
    xmax = 0
    ymax = 0
    alpha = 0.9
    color = '#000000'
    n_quantiles = 100

    q_pos = np.concatenate([
        np.arange(99.) / len(data),
        np.logspace(-np.log10(len(data)) + 2, 0, n_quantiles)
    ])

    q_data = mquantiles(data, prob=q_pos, alphap=0, betap=1, limit=(0, 1))
    q_th = q_pos.copy()
    q_err = np.zeros([len(q_pos), 2])
    for i in range(0, len(q_pos)):
        q_err[i, :] = q_err[i, :] = beta.interval(
            alpha,
            len(data) * q_pos[i],
            len(data) - len(data) * q_pos[i])

    q_err[i, q_err[i, :] < 0] = 1e-15
    slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
    xmax = np.max([xmax, -np.log10(q_th[1])])
    ymax = np.max([ymax, -np.log10(q_data[0])])

    ax.plot(
        -np.log10(q_th[n_quantiles - 1:]),
        -np.log10(q_data[n_quantiles - 1:]),
        '-',
        color=color)
    ax.plot(
        -np.log10(q_th[:n_quantiles]),
        -np.log10(q_data[:n_quantiles]),
        '.',
        color=color,
        label='gf')
    ax.plot([0, xmax], [0, xmax], '--k',color='#f42e30')
    ax.fill_between(
        -np.log10(q_th),
        -np.log10(q_err[:, 0]),
        -np.log10(q_err[:, 1]),
        color=color,
        alpha=0.1,
    )
def manhattan(df,ax):
    df[p] = -np.log10(df[p])
    df = df.sort_values(chrom)
    df_grouped = df.groupby((chrom))

    colors = ['#1A1A1A','#999999',]
    x_labels = []
    x_labels_pos = []
    end = 1000
    for num, (name, group) in enumerate(df_grouped):
        group[bp] = group[bp] + end
        end = group[bp].max() + 1000
        ax.scatter(group[bp], group[p],c=colors[num % len(colors)],s=1)
        x_labels.append(name)
        x_labels_pos.append(group[bp].mean())
    ax.axhline(y=-np.log10(5e-8), color='#2222FF', linestyle='-')
    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels)
#     print(df.loc[0,bp]-len(df)*0.1,end+len(df)*0.1)
#     ax.set_ylim([-0.5, df[p].max()*1.05])
# df = alldf.copy()
figure_tile = 'PH-277'
fig = plt.figure(figsize=(24, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[3, 1])
ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
qq(df[p], ax1, 'b')
manhattan(df,ax0)
# ax0.set_xlim(left=-3e7,right=2.9e9)
ax0.set_xlabel('Chromosome', fontsize=14)
ax0.set_ylabel('-$\mathregular{log_{10}}$P', fontsize=14)
ax1.set_xlabel('Observed -$\mathregular{log_{10}}$P', fontsize=14)
ax1.set_ylabel('Expected -$\mathregular{log_{10}}$P', fontsize=14)
ax0.spines['right'].set_visible(False)
ax0.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
fig.suptitle(figure_tile, fontsize=20)
fig.tight_layout()
fig.savefig('{}_Manhattan_QQ.pdf'.format(figure_tile), dpi=300)
/f/jianhua/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in less
/f/jianhua/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:33: RuntimeWarning: divide by zero encountered in log10
/f/jianhua/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:40: RuntimeWarning: divide by zero encountered in log10
/f/jianhua/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
../_images/manhattan_qq_7_1.png