Compare View

switch
from
...
to
 
Commits (2)
README.md
... ... @@ -39,6 +39,8 @@ python2.7 (2.7.12)
39 39  
40 40 ##### From Python
41 41  
  42 +Available from pip
  43 +
42 44 ```bash
43 45 biopython (1.68)
44 46 numpy (1.11.0)
... ... @@ -53,9 +55,9 @@ scipy (0.18.1)
53 55 > -c, --columns arguments. It is not available through pip but here are the
54 56 > steps to follow to install it:
55 57 > ```bash
56   -> cd PATH/TO/PYTHON/dist-packages/ or cd PATH/TO/PYTHON/site-packages/
57 58 > sudo -i
58   -> wget https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.4.tar.gz
  59 +> cd PATH/TO/PYTHON/dist-packages/ or cd PATH/TO/PYTHON/site-packages/
  60 +> wget https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.4.tar.gz --no-check-certificate
59 61 > tar -xzf mysql-connector-python-2.1.4.tar.gz
60 62 > cd mysql-connector-python-2.1.4
61 63 > python setup.py install
... ...
screen.py
... ... @@ -18,6 +18,7 @@
18 18  
19 19 from g4base import *
20 20 import os
  21 +import sys
21 22 import argparse
22 23  
23 24 def apply_network(ann,
... ... @@ -28,64 +29,57 @@ def apply_network(ann,
28 29 bedgraph=None,
29 30 verbose=False):
30 31 """
31   - Wrapping function
32   - Uses a provided ANN in pickled format (.pkl) and retrieves a dataframe of
33   - sequences to obtain their G4NN score. Saves the complete values in a .csv
34   - file.
35   -
36   - Wrapping functions don't return values but combine functions to achieve
37   - something.
  32 + Apply the ANN object to the sequences given in a fasta file or fasta string
38 33 """
  34 + # define columns in "all"
39 35 if "all" in columns:
40 36 columns = ['gene_symbol','mrnaAcc','protAcc','gene_stable_id',
41 37 'transcript_stable_id','full_name','HGNC_id','identifier',
42 38 'source','genome_assembly','chromosome','start','end','strand',
43 39 'length','sequence','cGcC','G4H','G4NN']
44   - else:
45   - columns = regex.split(",", columns.strip("[]"))
46 40 columns_to_drop = []
  41 + # three columns are essentials
  42 + # they are created and dropped if wasn't included in the user request
47 43 for essential in ['length', 'sequence', 'g4']:
48 44 if essential not in columns:
49 45 columns.append(essential)
50 46 columns_to_drop.append(essential)
51   - if str(fasta)[0] == '>':
  47 + # manage files and stings differently using adapted fasta fetcher
  48 + if type(fasta) == type(''):
52 49 RNome_df = gen_G4RNA_df(fasta_str_fetcher(fasta, verbose=verbose),
53 50 columns, 1, int(wdw_len), int(wdw_step), verbose=verbose)
54   - elif str(fasta)[-3:] == '.fa'\
55   - or str(fasta)[-4:] in ['.fas', '.txt']\
56   - or str(fasta)[-6:] == '.fasta'\
57   - or fasta == "/dev/stdin":
  51 + else:
58 52 RNome_df = gen_G4RNA_df(fasta_fetcher(fasta, 0, 0, verbose=verbose),
59 53 columns, 1, int(wdw_len), int(wdw_step), verbose=verbose)
60   - else:
61   - screen_usage(52, 'fasta input not specified or not supported')
  54 + # only loads ANN and trimer_transfo when G4NN is in columns
62 55 if 'G4NN' in columns:
63   - network_file = open(ann,'r')
64   - ann = pickle.load(network_file)
65   - network_file.close()
  56 + ann = pickle.load(ann)
66 57 RNome_trans_df = trimer_transfo(RNome_df, 'sequence', verbose=verbose)
67   -# RNome_trans_df = kmer_transfo(RNome_df, 3, 'length', 'sequence', 'g4',
68   -# int(wdw_len), jellyfish=False, overlapped=True,
69   -# verbose=verbose)
70 58 RNome_df = submit_seq(ann, RNome_trans_df.drop('G4NN',axis=1),
71 59 [c for c in columns if c != 'G4NN'], "G4NN",
72 60 verbose=verbose)
  61 + # write bedgraph header in stdout if -b --bedgraph in arguments
  62 + # the browser initial position will cover the first chromosome supplied in
  63 + # fasta from the minimal position to maximal position
73 64 if bedgraph:
74 65 sys.stdout.write('browser position %s:%d-%d\n'%(
75   - RNome_df['chromosome'].iloc[0],
  66 + RNome_df['chromosome'].dropna().iloc[0],
76 67 RNome_df[
77 68 RNome_df.chromosome == RNome_df[
78   - 'chromosome'].iloc[0]].start.min(),
  69 + 'chromosome'].dropna().iloc[0]].start.min(),
79 70 RNome_df[
80 71 RNome_df.chromosome == RNome_df[
81   - 'chromosome'].iloc[0]].end.max()))
82   - sys.stdout.write('track type=bedGraph name=%s visibility=full \
83   -color=200,100,0\n'%RNome_df.drop(columns_to_drop, axis=1).columns[-1])
  72 + 'chromosome'].dropna().iloc[0]].end.max()))
  73 + sys.stdout.write('track type=bedGraph name=%s visibility=full '\
  74 + 'color=200,0,0\n'%RNome_df.drop(
  75 + columns_to_drop, axis=1).columns[-1])
84 76 return RNome_df.drop(columns_to_drop, axis=1)
85 77  
86 78 def screen_usage(error_value=False, error_message=False):
87 79 """
88 80 Provide the user with instructions to use screen.py.
  81 +
  82 + DEPRECATED
89 83 """
90 84 print "Usage: PATH/TO/screen.py [OPTIONS...]"
91 85 print "Use -? or --help to show this message"
... ... @@ -133,12 +127,12 @@ def screen_usage(error_value=False, error_message=False):
133 127 else:
134 128 sys.exit(0)
135 129  
136   -def main():
  130 +def legacy_main():
137 131 """
138 132 Handles arguments.
  133 +
  134 + DEPRECATED
139 135 """
140   - global start_time
141   - start_time = time.time()
142 136 #Default values here in option_dict
143 137 option_dict = {"--columns":"description,sequence,start,cGcC,G4H,G4NN",
144 138 "--ann":os.path.dirname(__file__)+"/G4RNA_2016-11-07.pkl",
... ... @@ -213,16 +207,22 @@ def main():
213 207 else:
214 208 screen_usage(50, 'An option is missing, incorrect or not authorized')
215 209  
216   -class Formatter(argparse.ArgumentDefaultsHelpFormatter):
  210 +class Formatter(argparse.HelpFormatter):
  211 + """
  212 + Extended HelpFormatter class in order to correct the greediness of --columns
  213 + that includes the last positional argument. This extension of HelpFormatter
  214 + brings the positional argument to the beginning of the command and the
  215 + optonal arguments are send to the end.
  216 +
  217 + This snippet of code was adapted from user "hpaulj" from StackOverflow.
  218 + """
217 219 # use defined argument order to display usage
218 220 def _format_usage(self, usage, actions, groups, prefix):
219 221 if prefix is None:
220 222 prefix = 'usage: '
221   -
222 223 # if usage is specified, use that
223 224 if usage is not None:
224 225 usage = usage % dict(prog=self._prog)
225   -
226 226 # if no optionals or positionals are available, usage is just prog
227 227 elif usage is None and not actions:
228 228 usage = '%(prog)s' % dict(prog=self._prog)
... ... @@ -237,39 +237,54 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter):
237 237  
238 238 def arguments():
239 239 """
240   - Handles arguments
  240 + Arguments management
241 241 """
  242 + # declare argument parser using the above adapted HelpFormatter
242 243 parser = argparse.ArgumentParser(formatter_class=Formatter,
243   - prog="screen.py",
  244 + prog=os.path.basename(__file__),
244 245 description="Identification of potential RNA G-quadruplexes",
245 246 epilog="G4RNA screener Copyright (C) 2018 Jean-Michel Garant "\
246 247 "This program comes with ABSOLUTELY NO WARRANTY. This is free "\
247 248 "software, and you are welcome to redistribute it under certain "\
248 249 "conditions <http://www.gnu.org/licenses/>.")
  250 + # FASTA input from STDIN is supported by default using argument "-"
249 251 parser.add_argument('FASTA',
250 252 type=argparse.FileType('r'),
251   - help='FASTA file (.fa .fas)')
  253 + default=sys.stdin,
  254 + help='FASTA file (.fa .fas .fasta), - for default STDIN')
  255 + # the .pkl file is a trained pybrain ANN object that have been saved in a
  256 + # readable format. Read about pickle package to know more
252 257 parser.add_argument("-a", "--ann",
253 258 type=argparse.FileType('r'),
254 259 default=os.path.dirname(__file__)+"/G4RNA_2016-11-07.pkl",
255   - help="Supply a picled ANN (.pkl format)")
  260 + help="Supply a picled ANN (default: G4RNA_2016-11-07.pkl)")
  261 + # length of segmentation of long sequences into analysis windows
256 262 parser.add_argument("-w", "--window",
257 263 type=int,
258 264 default=60,
259   - help="Window length",
  265 + help="Window length (default: 60)",
260 266 metavar="INT")
  267 + # step in between each overlapping windows. small steps means more
  268 + # resolution but higher computational time
261 269 parser.add_argument("-s", "--step",
262 270 type=int,
263 271 default=10,
264   - help="Step length between windows",
  272 + help="Step length between windows (default: 10)",
265 273 metavar="INT")
  274 + # bedgraph will generate the required header, compatible UCSC genome browser
266 275 parser.add_argument("-b", "--bedgraph",
267 276 action="store_true",
268 277 default=False,
269 278 help="Display output as BedGraph, user must provides columns")
  279 + ## TODO use choices of three scores as bedgraph options which will
  280 + ## select columns for the user, must include verifications
  281 + # columns to generate are provided by a space delimited list
270 282 parser.add_argument("-c", "--columns",
271 283 nargs="+",
272   - choices=["list", "all", "description", "gene_symbol",
  284 + choices=["list",
  285 + "all",
  286 + "description",
  287 + "gene_symbol",
273 288 "mrnaAcc",
274 289 "protAcc",
275 290 "gene_stable_id",
... ... @@ -290,27 +305,33 @@ def arguments():
290 305 "G4H",
291 306 "G4NN",
292 307 ],
293   - default="description",
294   - help="Columns to display. To browse available columns use: -c list",
  308 + default=["description","sequence","start","cGcC","G4H","G4NN"],
  309 + help="Columns to display (default: description sequence start "\
  310 + "cGcC G4H G4NN). "\
  311 + "To browse available columns use: -c list",
295 312 metavar="")
  313 + # verbose option is very rudimental
296 314 parser.add_argument("-v", "--verbose",
297 315 action="store_true",
298 316 default=False,
299   - help="Verbose output with timed operations")
  317 + help="Verbose output with operations when completed")
  318 + # useful for debug, not meant for users
300 319 parser.add_argument("-e", "--error",
301 320 action="store_true",
302 321 default=False,
303 322 help="Raise errors and exceptions")
304   -
305 323 return parser
306 324  
307   -def to_replace_main():
  325 +def main():
308 326 """
309 327 Functions calls
310 328 """
311   - args = arguments().parse_args()
  329 + parser = arguments()
  330 + args = parser.parse_args()
  331 + # custom help message to list columns choices
312 332 if args.columns == ["list"]:
313   - splitted_help = arguments().format_help().split(". To browse available columns use:\n\
  333 + splitted_help = parser.format_help().split(
  334 + ". To browse available columns use:\n\
314 335 -c list (default: description)")
315 336 print("\n\t".join([splitted_help[0],
316 337 "Available columns:",
... ... @@ -338,8 +359,41 @@ def to_replace_main():
338 359 "G4NN \t\tG4NN score of similitude",
339 360 " \t\t(must be specified to use ANN)",
340 361 splitted_help[1]]))
341   - print(args)
  362 + # restrictive verifications for bedgraph options
  363 + if args.bedgraph and (
  364 + len(args.columns) != 4\
  365 + or args.columns[0:3] != ['chromosome','start','end']\
  366 + or args.columns[-1] not in ['cGcC', 'G4H', 'G4NN']):
  367 + parser.print_usage()
  368 + sys.stderr.write(parser.prog+': error: '\
  369 + 'BedGraph format requires 4 ordered columns: '\
  370 + 'chromosome start end [SCORE] '\
  371 + 'where [SCORE] is either cGcC, G4H or G4NN\n')
  372 + sys.exit()
  373 + # run in a try/except to generate a custom error message
  374 + try:
  375 + apply_network(args.ann,
  376 + args.FASTA,
  377 + args.columns,
  378 + args.window,
  379 + args.step,
  380 + args.bedgraph,
  381 + args.verbose
  382 + ).to_csv(
  383 + path_or_buf=sys.stdout, sep='\t',
  384 + index=(args.bedgraph==False),
  385 + header=(args.bedgraph==False))
  386 + except:
  387 + # raise python error calls if -e --error is used
  388 + if args.error:
  389 + raise
  390 + # custom error message
  391 + else:
  392 + parser.print_usage()
  393 + sys.stderr.write(parser.prog+': error: '\
  394 + 'An option is missing, incorrect or not authorized\n')
  395 +
342 396  
343 397 if __name__ == '__main__':
344 398 main()
345   -# to_replace_main()
  399 +# legacy_main()
... ...
utils.py
... ... @@ -16,7 +16,6 @@
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18  
19   -import time
20 19 import sys
21 20 import regex
22 21 import pickle
... ... @@ -24,40 +23,20 @@ import pandas as pd
24 23 import numpy as np
25 24 from collections import Counter, OrderedDict
26 25  
27   -
28   -def hms_string(time_elapsed):
29   - """
30   - Format time in hours, minutes, seconds.
31   -
32   - Must be included in a string to format a time value.
33   - """
34   - h = int(time_elapsed / (60 * 60))
35   - m = int((time_elapsed % (60 * 60)) / 60)
36   - s = time_elapsed % 60.
37   - return "{}:{:>02}:{:>05.2f}".format(h, m, s)
38   -
39   -def verbosify(verbose, message, time_it=False):
  26 +def verbosify(verbose, message, flush=False):
40 27 """
41 28 Take care of the verbosity for the user.
42 29  
43   -****Time_it requires a global variable -> start_time****
44   -
45 30 Supports both Boolean value of verbose and numerical level of verbose.
46 31 Either print or flush message.
47 32 """
48   - if verbose == None:
  33 + if verbose == False or verbose == 0:
49 34 pass
50   - elif (verbose == True or verbose > 0) and time_it == True:
51   - print message+"\t"*(4-len(message)/8)+\
52   - "{}".format(hms_string(time.time() - start_time))
53   - elif (verbose == True or verbose > 0) and time_it == False:
54   - print message
55   - else:
56   - if "\n" in message:
57   - print message
58   - else:
59   - sys.stdout.write(message+"..."+" "*(77-len(message))+"\r")
60   - sys.stdout.flush()
  35 + elif (verbose == True or verbose > 0) and flush == False:
  36 + sys.stdout.write(message+"\n")
  37 + elif flush == True:
  38 + sys.stdout.write(message+"..."+" "*(77-len(message))+"\r")
  39 + sys.stdout.flush()
61 40  
62 41 def connect_psql(host, user, passwd, db, number_of_cursor):# schema,
63 42 """
... ... @@ -156,7 +135,7 @@ def retrieve_xref_Ensembl(stable_id=None,mrnaAcc=None,
156 135 'AND value = "%s"'%gene_acronym)
157 136 return list(cursor.fetchone())
158 137  
159   -def format_description(fas_description, verbose=None):
  138 +def format_description(fas_description, verbose=False):
160 139 '''
161 140 Takes a fasta description line and try to retrieve informations out of it
162 141 if it has a known format.
... ...