Commit 13da59b8159d190101cd5fea619ae805b2855b2f

Authored by Jean-Michel Garant
1 parent afd0c15b
Exists in stable-0.3 and in 1 other branch master

merge.py filters and merges, columns are aggregated

Showing 2 changed files with 134 additions and 25 deletions   Show diff stats
merge.py
... ... @@ -16,12 +16,19 @@
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18  
  19 +# temporary warning filter until packages with numpy dependencies are updated
  20 +import warnings
  21 +warnings.filterwarnings("ignore", message="numpy.dtype size changed")
  22 +warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
  23 +
19 24 import os
20 25 import sys
21 26 import utils
22 27 import argparse
23 28 import pandas as pd
24 29  
  30 +pd.set_option('display.max_columns', None)
  31 +
25 32 class float_range(object):
26 33 """
27 34 Object that defines a range of float that is authorized or returns the
... ... @@ -40,29 +47,117 @@ class float_range(object):
40 47 def __repr__(self):
41 48 return '[{0}:{1}]'.format(self.start, self.end)
42 49  
43   -def custom(sub_df):
44   - return list(sub_df.sequence)
45   - #return len(''.join(sub_df.sequence))
  50 +#def too_much_merge_g4rna(df, window=60, step=10,
  51 +# cGcC=False, G4H=False, G4NN=False):
  52 +# """
  53 +# Merges consecutive windows that are scored above the provided thresholds
  54 +# into a single hit sequence and discards windows below the thresholds.
  55 +# Three levels of verification that windows are consecutive:
  56 +#
  57 +# description column:
  58 +# For window with index n, consecutive windows share the same description
  59 +# description(n) == description(n+1)
  60 +#
  61 +# start column:
  62 +# For window with index n, the consecutive windows have
  63 +# start(n+1) - start(n) = step
  64 +#
  65 +# sequence column:
  66 +# For window with index n and length l, the consecutive windows have
  67 +# Last part of sequence(n) == first part of sequence(n+1)
  68 +# sequence(n)[step-l:] == sequence(n+1)[:l-step]
  69 +# """
  70 +# verif = []
  71 +# if 'description' in df.columns:
  72 +# verif.append(set(df.index[
  73 +# df.description.eq(df.description.shift(-1)) |
  74 +# df.description.shift().eq(df.description)].tolist()))
  75 +# #verif.append('description')
  76 +# # start and next start are distanced by step length new column True
  77 +# if 'start' in df.columns:
  78 +# verif.append(set(df.index[
  79 +# df.start.eq(df.start.shift()+step) |
  80 +# df.start.shift(-1).eq(df.start+step)].tolist()))
  81 +# #verif.append('start')
  82 +# # overlap is the length that sequential windows should share
  83 +# overlap = window-step
  84 +# if 'sequence' in df.columns:
  85 +# verif.append(set(df.index[
  86 +# df.sequence.str[-overlap:].eq(
  87 +# df.sequence.str[:overlap].shift(-1)) |
  88 +# df.sequence.str[-overlap:].shift().eq(
  89 +# df.sequence.str[:overlap])].tolist()))
  90 +# #verif.append('sequence')
  91 +# if len(verif) == 3:
  92 +# keep = verif.pop(0).intersection(verif.pop(0),verif.pop(0))
  93 +# elif len(verif) == 2:
  94 +# keep = verif.pop(0).intersection(verif.pop(0))
  95 +# elif len(verif) == 1:
  96 +# keep = verif.pop(0)
  97 +#
  98 +# return df
46 99  
47 100 def merge_g4rna(df, window=60, step=10,
48   - cGcC=False, G4H=False, G4NN=False):
49   -# df = df.groupby(lambda x: group_func(df, x-1),
50   -# as_index=True, sort=False
51   -# )['start','sequence','cGcC','G4H','G4NN'].apply(
52   -# custom).reset_index()
53   -# df = df.apply(lambda x: annotate(df, x), axis=1)
54   - if 'start' in df.columns:
55   - df['start_match'] = (
56   - df.start.eq(df.start.shift()+step) |
57   - df.start.shift(-1).eq(df.start+step))
  101 + cGcC=False, G4H=False, G4NN=False,
  102 + score_aggregation=list):
  103 + """
  104 + """
  105 + if cGcC:
  106 + df = df[ df.cGcC >= cGcC ].dropna()
  107 + if G4H:
  108 + df = df[ df.G4H >= G4H ].dropna()
  109 + if G4NN:
  110 + df = df[ df.G4NN >= G4NN ].dropna()
  111 + agg_fct = {
  112 +# 'description':"".join,
  113 + 'gene_symbol':'max',
  114 + 'mrnaAcc':'mode',
  115 + 'protAcc':'mode',
  116 + 'gene_stable_id':'mode',
  117 + 'transcript_stable_id':'mode',
  118 + 'full_name':'max',
  119 + 'HGNC_id':'mode',
  120 + 'identifier':'mode',
  121 + 'source':'mode',
  122 + 'genome_assembly':'mode',
  123 + 'chromosome':'mode',
  124 + 'start':'min',
  125 + 'end':'max',
  126 + 'strand':'mode',
  127 + 'range':'mode',
  128 + 'length':'max',
  129 + 'sequence':'max',
  130 + 'cGcC':score_aggregation,
  131 + 'G4H':score_aggregation,
  132 + 'G4NN':score_aggregation
  133 + }
  134 + # overlap is the length that sequential windows should share
  135 + overlap = window-step
58 136 if 'sequence' in df.columns:
59   - df['seq_match'] = (
60   - df.sequence.str[-50:].eq(df.sequence.str[:50].shift(-1)) |
61   - df.sequence.str[-50:].shift().eq(df.sequence.str[:50]))
62   -# df = df.groupby(['start_match','seq_match'])
63   - print df
64   -
65   - #return df
  137 + for ite in [1,2,3,4,5,6]:
  138 + df.loc[
  139 + df.sequence.str[-overlap:].eq(
  140 + df.sequence.str[:overlap].shift(-ite)),
  141 + 'sequence'] = df.sequence.str[:] + \
  142 + df.sequence.str[overlap:].shift(-ite)
  143 + if 'description' in df.columns:
  144 + df_grouped = df.groupby(
  145 + [df.description,df.sequence.str[-overlap:]],
  146 + sort=False,
  147 + as_index=False)
  148 + print "******",{k:agg_fct[k] for k in df.columns.drop(['description'])}
  149 + return df_grouped.agg({k:agg_fct[k] for k in df.columns.drop(['description'])}).reindex(columns=df.columns)
  150 + else:
  151 + df_grouped = df.groupby(
  152 + df.sequence.str[-overlap:],
  153 + sort=False,
  154 + as_index=False)
  155 + return df_grouped.agg(
  156 + {k:agg_fct[k] for k in df.columns},
  157 + ).reindex(df.columns, axis=1)
  158 + else:
  159 + sys.stderr.write("UsageError: 'sequence' column must be provided\n")
  160 + sys.exit()
66 161  
67 162 def arguments():
68 163 """
... ... @@ -110,7 +205,16 @@ def arguments():
110 205 help="Use G4NN score threshold to determine positive windows "\
111 206 "(default: 0.5)",
112 207 metavar="FLOAT")
  208 + # aggregation function for scores
  209 + parser.add_argument("-a", "--aggregation",
  210 + #nargs="+",
  211 + choices=["max","min","median","mean","std","sem",list],
  212 + default=list,
  213 + help="Aggregation function to pool scores of merged windows "\
  214 + "(default: list)",
  215 + metavar="STR")
113 216 args = parser.parse_args()
  217 +# args.aggregation = [ aggr if ( aggr != 'list' ) else list for aggr in args.aggregation ]
114 218 if args.cGcC == None:
115 219 args.cGcC = 4.5
116 220 if args.G4H == None:
... ... @@ -122,11 +226,11 @@ def arguments():
122 226 def main():
123 227 args = arguments()
124 228 g4rna_frame = pd.read_csv(args.tsv, sep='\t', index_col=0)
125   -# 10,
126 229 merge_g4rna(g4rna_frame,
127 230 60, 10,
128   - args.cGcC, args.G4H, args.G4NN)#.to_csv(
129   -# path_or_buf=sys.stdout, sep='\t')
  231 + args.cGcC, args.G4H, args.G4NN,
  232 + args.aggregation).to_csv(
  233 + path_or_buf=sys.stdout, sep='\t')
130 234  
131 235 if __name__ == '__main__':
132 236 main()
... ...
screen.py
... ... @@ -16,6 +16,11 @@
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18  
  19 +# temporary warning filter until packages with numpy dependencies are updated
  20 +import warnings
  21 +warnings.filterwarnings("ignore", message="numpy.dtype size changed")
  22 +warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
  23 +
19 24 import os
20 25 import sys
21 26 import pickle
... ... @@ -308,8 +313,7 @@ def main():
308 313 # custom help message to list columns choices
309 314 if args.columns == ["list"]:
310 315 splitted_help = parser.format_help().split(
311   - ". To browse available columns use:\n\
312   - -c list (default: description)")
  316 + " Columns to display (default: description sequence\n start cGcC G4H G4NN). To browse available columns use:\n -c list")
313 317 print("\n\t".join([splitted_help[0],
314 318 "Available columns:",
315 319 "description\t\tDescription as available in fasta (Default)",
... ... @@ -336,6 +340,7 @@ def main():
336 340 "G4NN \t\tG4NN score of similitude",
337 341 " \t\t(must be specified to use ANN)",
338 342 splitted_help[1]]))
  343 + sys.exit()
339 344 # restrictive verifications for bedgraph options
340 345 if args.bedgraph and (
341 346 len(args.columns) != 4\
... ...