Compare View

switch
from
...
to
 
Commits (3)
g4base.py
... ... @@ -16,7 +16,9 @@
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18  
19   -from utils import *
  19 +import regex
  20 +import utils
  21 +import pandas as pd
20 22 from pybrain.datasets import ClassificationDataSet
21 23  
22 24  
... ... @@ -82,7 +84,7 @@ def gen_G4RNA_df(
82 84 """
83 85 data = []
84 86 for key in seq_dict.keys():
85   - infos = format_description(key)
  87 + infos = utils.format_description(key)
86 88 content = {}
87 89 for ke in infos.keys():
88 90 content[ke] = infos[ke]
... ... @@ -98,8 +100,8 @@ def gen_G4RNA_df(
98 100 content["transcript_id"],
99 101 content["gene_symbol"],
100 102 content["gene_description"]
101   - ] = retrieve_xref_Ensembl(infos.get('stable_id'),
102   - infos.get('mrnaAcc'), retrieve_RefSeq(
  103 + ] = utils.retrieve_xref_Ensembl(infos.get('stable_id'),
  104 + infos.get('mrnaAcc'), utils.retrieve_RefSeq(
103 105 infos.get('mrnaAcc'),infos.get('protAcc')
104 106 )[2])
105 107 [content["full_name"],
... ... @@ -121,14 +123,15 @@ def gen_G4RNA_df(
121 123 content['protAcc'],
122 124 content['gene_symbol'],
123 125 content['product']
124   - ] = retrieve_RefSeq(infos["mrnaAcc"],infos["protAcc"])
  126 + ] = utils.retrieve_RefSeq(
  127 + infos["mrnaAcc"],infos["protAcc"])
125 128 except:
126 129 try:
127 130 [content['mrnaAcc'],
128 131 content['protAcc'],
129 132 content['gene_symbol'],
130 133 content['product']
131   - ] = retrieve_RefSeq(content["mrnaAcc"])
  134 + ] = utils.retrieve_RefSeq(content["mrnaAcc"])
132 135 except:
133 136 pass
134 137 for ke in infos.keys():
... ... @@ -175,7 +178,7 @@ def gen_G4RNA_df(
175 178 except:
176 179 row.append(None)
177 180 data.append(row)
178   - verbosify(verbose, "DataFrame built")
  181 + utils.verbosify(verbose, "DataFrame built")
179 182 return pd.DataFrame(data=data, index=range(
180 183 first_id, first_id+len(data)), columns=columns)
181 184  
... ... @@ -204,7 +207,7 @@ def submit_seq(
204 207 test_results = ann.activateOnDataset(alldata_tst)
205 208 final_df = df_.copy()
206 209 final_df[score_name]= pd.Series(test_results[:,1], index=final_df.index)
207   - verbosify(verbose, 'Sequence submitted')
  210 + utils.verbosify(verbose, 'Sequence submitted')
208 211 return final_df.drop(
209 212 [c for c in final_df.columns if c not in except_columns][:-1],
210 213 axis=1)
... ...
merge.py 0 → 100755
... ... @@ -0,0 +1,132 @@
  1 +#!/usr/bin/env python
  2 +
  3 +# Identification of potential RNA G-quadruplexes by G4RNA screener.
  4 +# Copyright (C) 2018 Jean-Michel Garant
  5 +#
  6 +# This program is free software: you can redistribute it and/or modify
  7 +# it under the terms of the GNU General Public License as published by
  8 +# the Free Software Foundation, either version 3 of the License, or
  9 +# (at your option) any later version.
  10 +#
  11 +# This program is distributed in the hope that it will be useful,
  12 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 +# GNU General Public License for more details.
  15 +#
  16 +# You should have received a copy of the GNU General Public License
  17 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 +
  19 +import os
  20 +import sys
  21 +import utils
  22 +import argparse
  23 +import pandas as pd
  24 +
  25 +class float_range(object):
  26 + """
  27 + Object that defines an authorized range of floats; its representation
  28 + shows the authorized range. Used to validate score thresholds input by the user.
  29 + """
  30 + def __init__(self, start, end):
  31 + self.start = start
  32 + self.end = end
  33 + def __eq__(self, other):
  34 + if self.start == None:
  35 + return other <= self.end
  36 + elif self.end == None:
  37 + return self.start <= other
  38 + else:
  39 + return self.start <= other <= self.end
  40 + def __repr__(self):
  41 + return '[{0}:{1}]'.format(self.start, self.end)
  42 +
  43 +def custom(sub_df):
  44 + return list(sub_df.sequence)
  45 + #return len(''.join(sub_df.sequence))
  46 +
  47 +def merge_g4rna(df, window=60, step=10,
  48 + cGcC=False, G4H=False, G4NN=False):
  49 +# df = df.groupby(lambda x: group_func(df, x-1),
  50 +# as_index=True, sort=False
  51 +# )['start','sequence','cGcC','G4H','G4NN'].apply(
  52 +# custom).reset_index()
  53 +# df = df.apply(lambda x: annotate(df, x), axis=1)
  54 + if 'start' in df.columns:
  55 + df['start_match'] = (
  56 + df.start.eq(df.start.shift()+step) |
  57 + df.start.shift(-1).eq(df.start+step))
  58 + if 'sequence' in df.columns:
  59 + df['seq_match'] = (
  60 + df.sequence.str[-50:].eq(df.sequence.str[:50].shift(-1)) |
  61 + df.sequence.str[-50:].shift().eq(df.sequence.str[:50]))
  62 +# df = df.groupby(['start_match','seq_match'])
  63 + print df
  64 +
  65 + #return df
  66 +
  67 +def arguments():
  68 + """
  69 + Arguments management
  70 + """
  71 + # declare argument parser
  72 + parser = argparse.ArgumentParser(formatter_class=utils.Formatter,
  73 + prog=os.path.basename(__file__),
  74 + description="Merge positive windows of screen.py output and "\
  75 + "discard windows below the threshold(s)",
  76 + epilog="G4RNA screener Copyright (C) 2018 Jean-Michel Garant "\
  77 + "This program comes with ABSOLUTELY NO WARRANTY. This is free "\
  78 + "software, and you are welcome to redistribute it under certain "\
  79 + "conditions <http://www.gnu.org/licenses/>.")
  80 + # TSV input from STDIN is supported by default using argument "-"
  81 + parser.add_argument('tsv',
  82 + type=argparse.FileType('r'),
  83 + default=sys.stdin,
  84 + help='TSV file (tab separated values), - for default STDIN',
  85 + metavar='TSV')
  86 + # Use cGcC
  87 + parser.add_argument("--cGcC",
  88 + type=float,
  89 + nargs='?',
  90 + choices=[float_range(0,None)],
  91 + default=False,
  92 + help="Use cGcC score threshold to determine positive windows "\
  93 + "(default: 4.5)",
  94 + metavar="FLOAT")
  95 + # Use G4H
  96 + parser.add_argument("--G4H",
  97 + type=float,
  98 + nargs='?',
  99 + choices=[float_range(-4,4)],
  100 + default=False,
  101 + help="Use G4Hunter score threshold to determine positive windows "\
  102 + "(default: 0.9)",
  103 + metavar="FLOAT")
  104 + # Use G4NN
  105 + parser.add_argument("--G4NN",
  106 + type=float,
  107 + nargs='?',
  108 + choices=[float_range(0,1)],
  109 + default=False,
  110 + help="Use G4NN score threshold to determine positive windows "\
  111 + "(default: 0.5)",
  112 + metavar="FLOAT")
  113 + args = parser.parse_args()
  114 + if args.cGcC == None:
  115 + args.cGcC = 4.5
  116 + if args.G4H == None:
  117 + args.G4H = 0.9
  118 + if args.G4NN == None:
  119 + args.G4NN = 0.5
  120 + return args
  121 +
  122 +def main():
  123 + args = arguments()
  124 + g4rna_frame = pd.read_csv(args.tsv, sep='\t', index_col=0)
  125 +# 10,
  126 + merge_g4rna(g4rna_frame,
  127 + 60, 10,
  128 + args.cGcC, args.G4H, args.G4NN)#.to_csv(
  129 +# path_or_buf=sys.stdout, sep='\t')
  130 +
  131 +if __name__ == '__main__':
  132 + main()
... ...
screen.py
... ... @@ -16,10 +16,12 @@
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18  
19   -from g4base import *
20 19 import os
21 20 import sys
  21 +import pickle
22 22 import argparse
  23 +import utils
  24 +import g4base
23 25  
24 26 def apply_network(ann,
25 27 fasta,
... ... @@ -46,16 +48,19 @@ def apply_network(ann,
46 48 columns_to_drop.append(essential)
47 49 # manage files and strings differently using adapted fasta fetcher
48 50 if type(fasta) == type(''):
49   - RNome_df = gen_G4RNA_df(fasta_str_fetcher(fasta, verbose=verbose),
  51 + RNome_df = g4base.gen_G4RNA_df(utils.fasta_str_fetcher(fasta,
  52 + verbose=verbose),
50 53 columns, 1, int(wdw_len), int(wdw_step), verbose=verbose)
51 54 else:
52   - RNome_df = gen_G4RNA_df(fasta_fetcher(fasta, 0, 0, verbose=verbose),
  55 + RNome_df = g4base.gen_G4RNA_df(
  56 + utils.fasta_fetcher(fasta, 0, 0, verbose=verbose),
53 57 columns, 1, int(wdw_len), int(wdw_step), verbose=verbose)
54 58 # only loads ANN and trimer_transfo when G4NN is in columns
55 59 if 'G4NN' in columns:
56 60 ann = pickle.load(ann)
57   - RNome_trans_df = trimer_transfo(RNome_df, 'sequence', verbose=verbose)
58   - RNome_df = submit_seq(ann, RNome_trans_df.drop('G4NN',axis=1),
  61 + RNome_trans_df = utils.trimer_transfo(RNome_df, 'sequence',
  62 + verbose=verbose)
  63 + RNome_df = g4base.submit_seq(ann, RNome_trans_df.drop('G4NN',axis=1),
59 64 [c for c in columns if c != 'G4NN'], "G4NN",
60 65 verbose=verbose)
61 66 # write bedgraph header in stdout if -b --bedgraph in arguments
... ... @@ -207,40 +212,12 @@ def legacy_main():
207 212 else:
208 213 screen_usage(50, 'An option is missing, incorrect or not authorized')
209 214  
210   -class Formatter(argparse.HelpFormatter):
211   - """
212   - Extended HelpFormatter class in order to correct the greediness of --columns
213   - that includes the last positional argument. This extension of HelpFormatter
214   - brings the positional argument to the beginning of the command and the
215   - optonal arguments are send to the end.
216   -
217   - This snippet of code was adapted from user "hpaulj" from StackOverflow.
218   - """
219   - # use defined argument order to display usage
220   - def _format_usage(self, usage, actions, groups, prefix):
221   - if prefix is None:
222   - prefix = 'usage: '
223   - # if usage is specified, use that
224   - if usage is not None:
225   - usage = usage % dict(prog=self._prog)
226   - # if no optionals or positionals are available, usage is just prog
227   - elif usage is None and not actions:
228   - usage = '%(prog)s' % dict(prog=self._prog)
229   - elif usage is None:
230   - prog = '%(prog)s' % dict(prog=self._prog)
231   - # build full usage string
232   - action_usage = self._format_actions_usage(actions, groups) # NEW
233   - usage = ' '.join([s for s in [prog, action_usage] if s])
234   - # omit the long line wrapping code
235   - # prefix with 'usage:'
236   - return '%s%s\n\n' % (prefix, usage)
237   -
238 215 def arguments():
239 216 """
240 217 Arguments management
241 218 """
242 219 # declare argument parser using the adapted HelpFormatter (utils.Formatter)
243   - parser = argparse.ArgumentParser(formatter_class=Formatter,
  220 + parser = argparse.ArgumentParser(formatter_class=utils.Formatter,
244 221 prog=os.path.basename(__file__),
245 222 description="Identification of potential RNA G-quadruplexes",
246 223 epilog="G4RNA screener Copyright (C) 2018 Jean-Michel Garant "\
... ...
utils.py
... ... @@ -18,11 +18,39 @@
18 18  
19 19 import sys
20 20 import regex
21   -import pickle
  21 +import argparse
22 22 import pandas as pd
23 23 import numpy as np
24 24 from collections import Counter, OrderedDict
25 25  
  26 +class Formatter(argparse.HelpFormatter):
  27 + """
  28 + Extended HelpFormatter class in order to correct the greediness of --columns
  29 + that includes the last positional argument. This extension of HelpFormatter
  30 + brings the positional argument to the beginning of the command and the
  31 + optional arguments are sent to the end.
  32 +
  33 + This snippet of code was adapted from user "hpaulj" from StackOverflow.
  34 + """
  35 + # use defined argument order to display usage
  36 + def _format_usage(self, usage, actions, groups, prefix):
  37 + if prefix is None:
  38 + prefix = 'usage: '
  39 + # if usage is specified, use that
  40 + if usage is not None:
  41 + usage = usage % dict(prog=self._prog)
  42 + # if no optionals or positionals are available, usage is just prog
  43 + elif usage is None and not actions:
  44 + usage = '%(prog)s' % dict(prog=self._prog)
  45 + elif usage is None:
  46 + prog = '%(prog)s' % dict(prog=self._prog)
  47 + # build full usage string
  48 + action_usage = self._format_actions_usage(actions, groups) # NEW
  49 + usage = ' '.join([s for s in [prog, action_usage] if s])
  50 + # omit the long line wrapping code
  51 + # prefix with 'usage:'
  52 + return '%s%s\n\n' % (prefix, usage)
  53 +
26 54 def verbosify(verbose, message, flush=False):
27 55 """
28 56 Take care of the verbosity for the user.
... ...