Commit 644db29e0bfaf69e9630587d1a5cee6159f80af0

Authored by Jean-Michel Garant
1 parent 7028e348
Exists in stable-0.3 and in 1 other branch master

new main included, legacy_main commented

Showing 3 changed files with 78 additions and 78 deletions   Show diff stats
README.md
... ... @@ -39,6 +39,8 @@ python2.7 (2.7.12)
39 39  
40 40 ##### From Python
41 41  
  42 +Available from pip
  43 +
42 44 ```bash
43 45 biopython (1.68)
44 46 numpy (1.11.0)
... ... @@ -53,9 +55,9 @@ scipy (0.18.1)
53 55 > -c, --columns arguments. It is not available through pip but here are the
54 56 > steps to follow to install it:
55 57 > ```bash
56   -> cd PATH/TO/PYTHON/dist-packages/ or cd PATH/TO/PYTHON/site-packages/
57 58 > sudo -i
58   -> wget https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.4.tar.gz
  59 +> cd PATH/TO/PYTHON/dist-packages/ or cd PATH/TO/PYTHON/site-packages/
  60 +> wget https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.4.tar.gz --no-check-certificate
59 61 > tar -xzf mysql-connector-python-2.1.4.tar.gz
60 62 > cd mysql-connector-python-2.1.4
61 63 > python setup.py install
... ...
screen.py
... ... @@ -29,64 +29,57 @@ def apply_network(ann,
29 29 bedgraph=None,
30 30 verbose=False):
31 31 """
32   - Wrapping function
33   - Uses a provided ANN in pickled format (.pkl) and retrieves a dataframe of
34   - sequences to obtain their G4NN score. Saves the complete values in a .csv
35   - file.
36   -
37   - Wrapping functions don't return values but combine functions to achieve
38   - something.
  32 + Apply the ANN object to the sequences given in a fasta file or fasta string
39 33 """
  34 + # define columns in "all"
40 35 if "all" in columns:
41 36 columns = ['gene_symbol','mrnaAcc','protAcc','gene_stable_id',
42 37 'transcript_stable_id','full_name','HGNC_id','identifier',
43 38 'source','genome_assembly','chromosome','start','end','strand',
44 39 'length','sequence','cGcC','G4H','G4NN']
45   - #else:
46   - # columns = regex.split(",", columns.strip("[]"))
47 40 columns_to_drop = []
  41 + # three columns are essentials
  42 + # they are created and dropped if wasn't included in the user request
48 43 for essential in ['length', 'sequence', 'g4']:
49 44 if essential not in columns:
50 45 columns.append(essential)
51 46 columns_to_drop.append(essential)
52   - if str(fasta)[0] == '>':
  47 + # manage files and stings differently using adapted fasta fetcher
  48 + if type(fasta) == type(''):
53 49 RNome_df = gen_G4RNA_df(fasta_str_fetcher(fasta, verbose=verbose),
54 50 columns, 1, int(wdw_len), int(wdw_step), verbose=verbose)
55   - elif str(fasta)[-3:] == '.fa'\
56   - or str(fasta)[-4:] in ['.fas', '.txt']\
57   - or str(fasta)[-6:] == '.fasta'\
58   - or fasta == "/dev/stdin":
  51 + else:
59 52 RNome_df = gen_G4RNA_df(fasta_fetcher(fasta, 0, 0, verbose=verbose),
60 53 columns, 1, int(wdw_len), int(wdw_step), verbose=verbose)
61   - else:
62   - screen_usage(52, 'fasta input not specified or not supported')
  54 + # only loads ANN and trimer_transfo when G4NN is in columns
63 55 if 'G4NN' in columns:
64   - # network_file = open(ann,'r')
65 56 ann = pickle.load(ann)
66   - # network_file.close()
67 57 RNome_trans_df = trimer_transfo(RNome_df, 'sequence', verbose=verbose)
68   -# RNome_trans_df = kmer_transfo(RNome_df, 3, 'length', 'sequence', 'g4',
69   -# int(wdw_len), jellyfish=False, overlapped=True,
70   -# verbose=verbose)
71 58 RNome_df = submit_seq(ann, RNome_trans_df.drop('G4NN',axis=1),
72 59 [c for c in columns if c != 'G4NN'], "G4NN",
73 60 verbose=verbose)
  61 + # write bedgraph header in stdout if -b --bedgraph in arguments
  62 + # the browser initial position will cover the first chromosome supplied in
  63 + # fasta from the minimal position to maximal position
74 64 if bedgraph:
75 65 sys.stdout.write('browser position %s:%d-%d\n'%(
76   - RNome_df['chromosome'].iloc[0],
  66 + RNome_df['chromosome'].dropna().iloc[0],
77 67 RNome_df[
78 68 RNome_df.chromosome == RNome_df[
79   - 'chromosome'].iloc[0]].start.min(),
  69 + 'chromosome'].dropna().iloc[0]].start.min(),
80 70 RNome_df[
81 71 RNome_df.chromosome == RNome_df[
82   - 'chromosome'].iloc[0]].end.max()))
83   - sys.stdout.write('track type=bedGraph name=%s visibility=full \
84   -color=200,100,0\n'%RNome_df.drop(columns_to_drop, axis=1).columns[-1])
  72 + 'chromosome'].dropna().iloc[0]].end.max()))
  73 + sys.stdout.write('track type=bedGraph name=%s visibility=full '\
  74 + 'color=200,0,0\n'%RNome_df.drop(
  75 + columns_to_drop, axis=1).columns[-1])
85 76 return RNome_df.drop(columns_to_drop, axis=1)
86 77  
87 78 def screen_usage(error_value=False, error_message=False):
88 79 """
89 80 Provide the user with instructions to use screen.py.
  81 +
  82 + DEPRECATED
90 83 """
91 84 print "Usage: PATH/TO/screen.py [OPTIONS...]"
92 85 print "Use -? or --help to show this message"
... ... @@ -134,12 +127,12 @@ def screen_usage(error_value=False, error_message=False):
134 127 else:
135 128 sys.exit(0)
136 129  
137   -def main():
  130 +def legacy_main():
138 131 """
139 132 Handles arguments.
  133 +
  134 + DEPRECATED
140 135 """
141   - global start_time
142   - start_time = time.time()
143 136 #Default values here in option_dict
144 137 option_dict = {"--columns":"description,sequence,start,cGcC,G4H,G4NN",
145 138 "--ann":os.path.dirname(__file__)+"/G4RNA_2016-11-07.pkl",
... ... @@ -215,15 +208,21 @@ def main():
215 208 screen_usage(50, 'An option is missing, incorrect or not authorized')
216 209  
217 210 class Formatter(argparse.HelpFormatter):
  211 + """
  212 + Extended HelpFormatter class in order to correct the greediness of --columns
  213 + that includes the last positional argument. This extension of HelpFormatter
  214 + brings the positional argument to the beginning of the command and the
  215 + optonal arguments are send to the end.
  216 +
  217 + This snippet of code was adapted from user "hpaulj" from StackOverflow.
  218 + """
218 219 # use defined argument order to display usage
219 220 def _format_usage(self, usage, actions, groups, prefix):
220 221 if prefix is None:
221 222 prefix = 'usage: '
222   -
223 223 # if usage is specified, use that
224 224 if usage is not None:
225 225 usage = usage % dict(prog=self._prog)
226   -
227 226 # if no optionals or positionals are available, usage is just prog
228 227 elif usage is None and not actions:
229 228 usage = '%(prog)s' % dict(prog=self._prog)
... ... @@ -238,38 +237,48 @@ class Formatter(argparse.HelpFormatter):
238 237  
239 238 def arguments():
240 239 """
241   - Handles arguments
  240 + Arguments management
242 241 """
  242 + # declare argument parser using the above adapted HelpFormatter
243 243 parser = argparse.ArgumentParser(formatter_class=Formatter,
244   - prog="screen.py",
  244 + prog=os.path.basename(__file__),
245 245 description="Identification of potential RNA G-quadruplexes",
246 246 epilog="G4RNA screener Copyright (C) 2018 Jean-Michel Garant "\
247 247 "This program comes with ABSOLUTELY NO WARRANTY. This is free "\
248 248 "software, and you are welcome to redistribute it under certain "\
249 249 "conditions <http://www.gnu.org/licenses/>.")
  250 + # FASTA input from STDIN is supported by default using argument "-"
250 251 parser.add_argument('FASTA',
251   - type=str,
252   - help='FASTA file (.fa .fas)')
  252 + type=argparse.FileType('r'),
  253 + default=sys.stdin,
  254 + help='FASTA file (.fa .fas .fasta), - for default STDIN')
  255 + # the .pkl file is a trained pybrain ANN object that have been saved in a
  256 + # readable format. Read about pickle package to know more
253 257 parser.add_argument("-a", "--ann",
254 258 type=argparse.FileType('r'),
255 259 default=os.path.dirname(__file__)+"/G4RNA_2016-11-07.pkl",
256 260 help="Supply a picled ANN (default: G4RNA_2016-11-07.pkl)")
  261 + # length of segmentation of long sequences into analysis windows
257 262 parser.add_argument("-w", "--window",
258 263 type=int,
259 264 default=60,
260 265 help="Window length (default: 60)",
261 266 metavar="INT")
  267 + # step in between each overlapping windows. small steps means more
  268 + # resolution but higher computational time
262 269 parser.add_argument("-s", "--step",
263 270 type=int,
264 271 default=10,
265 272 help="Step length between windows (default: 10)",
266 273 metavar="INT")
  274 + # bedgraph will generate the required header, compatible UCSC genome browser
267 275 parser.add_argument("-b", "--bedgraph",
268 276 action="store_true",
269 277 default=False,
270 278 help="Display output as BedGraph, user must provides columns")
271 279 ## TODO use choices of three scores as bedgraph options which will
272 280 ## select columns for the user, must include verifications
  281 + # columns to generate are provided by a space delimited list
273 282 parser.add_argument("-c", "--columns",
274 283 nargs="+",
275 284 choices=["list",
... ... @@ -297,26 +306,29 @@ def arguments():
297 306 "G4NN",
298 307 ],
299 308 default=["description","sequence","start","cGcC","G4H","G4NN"],
300   - help="Columns to display (default: description). "\
  309 + help="Columns to display (default: description sequence start "\
  310 + "cGcC G4H G4NN). "\
301 311 "To browse available columns use: -c list",
302 312 metavar="")
  313 + # verbose option is very rudimental
303 314 parser.add_argument("-v", "--verbose",
304 315 action="store_true",
305 316 default=False,
306   - help="Verbose output with timed operations")
  317 + help="Verbose output with operations when completed")
  318 + # useful for debug, not meant for users
307 319 parser.add_argument("-e", "--error",
308 320 action="store_true",
309 321 default=False,
310 322 help="Raise errors and exceptions")
311   -
312 323 return parser
313 324  
314   -def to_replace_main():
  325 +def main():
315 326 """
316 327 Functions calls
317 328 """
318 329 parser = arguments()
319 330 args = parser.parse_args()
  331 + # custom help message to list columns choices
320 332 if args.columns == ["list"]:
321 333 splitted_help = parser.format_help().split(
322 334 ". To browse available columns use:\n\
... ... @@ -347,15 +359,18 @@ def to_replace_main():
347 359 "G4NN \t\tG4NN score of similitude",
348 360 " \t\t(must be specified to use ANN)",
349 361 splitted_help[1]]))
350   - if args.bedgraph and len(args.columns) != 4 and args.columns[-1] not in [
351   - 'cGcC', 'G4H', 'G4NN']:
  362 + # restrictive verifications for bedgraph options
  363 + if args.bedgraph and (
  364 + len(args.columns) != 4\
  365 + or args.columns[0:3] != ['chromosome','start','end']\
  366 + or args.columns[-1] not in ['cGcC', 'G4H', 'G4NN']):
352 367 parser.print_usage()
353 368 sys.stderr.write(parser.prog+': error: '\
354   - 'BedGraph format requires 4 columns: '\
355   - 'chromosome start end [SCORE]\n'\
  369 + 'BedGraph format requires 4 ordered columns: '\
  370 + 'chromosome start end [SCORE] '\
356 371 'where [SCORE] is either cGcC, G4H or G4NN\n')
357 372 sys.exit()
358   - print args
  373 + # run in a try/except to generate a custom error message
359 374 try:
360 375 apply_network(args.ann,
361 376 args.FASTA,
... ... @@ -369,12 +384,16 @@ def to_replace_main():
369 384 index=(args.bedgraph==False),
370 385 header=(args.bedgraph==False))
371 386 except:
  387 + # raise python error calls if -e --error is used
372 388 if args.error:
373 389 raise
  390 + # custom error message
374 391 else:
375   - screen_usage(50, 'An option is missing, incorrect or not authorized')
  392 + parser.print_usage()
  393 + sys.stderr.write(parser.prog+': error: '\
  394 + 'An option is missing, incorrect or not authorized\n')
376 395  
377 396  
378 397 if __name__ == '__main__':
379   -# main()
380   - to_replace_main()
  398 + main()
  399 +# legacy_main()
... ...
utils.py
... ... @@ -16,7 +16,6 @@
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18  
19   -import time
20 19 import sys
21 20 import regex
22 21 import pickle
... ... @@ -24,40 +23,20 @@ import pandas as pd
24 23 import numpy as np
25 24 from collections import Counter, OrderedDict
26 25  
27   -
28   -def hms_string(time_elapsed):
29   - """
30   - Format time in hours, minutes, seconds.
31   -
32   - Must be included in a string to format a time value.
33   - """
34   - h = int(time_elapsed / (60 * 60))
35   - m = int((time_elapsed % (60 * 60)) / 60)
36   - s = time_elapsed % 60.
37   - return "{}:{:>02}:{:>05.2f}".format(h, m, s)
38   -
39   -def verbosify(verbose, message, time_it=False):
  26 +def verbosify(verbose, message, flush=False):
40 27 """
41 28 Take care of the verbosity for the user.
42 29  
43   -****Time_it requires a global variable -> start_time****
44   -
45 30 Supports both Boolean value of verbose and numerical level of verbose.
46 31 Either print or flush message.
47 32 """
48   - if verbose == None:
  33 + if verbose == False or verbose == 0:
49 34 pass
50   - elif (verbose == True or verbose > 0) and time_it == True:
51   - print message+"\t"*(4-len(message)/8)+\
52   - "{}".format(hms_string(time.time() - start_time))
53   - elif (verbose == True or verbose > 0) and time_it == False:
54   - print message
55   - else:
56   - if "\n" in message:
57   - print message
58   - else:
59   - sys.stdout.write(message+"..."+" "*(77-len(message))+"\r")
60   - sys.stdout.flush()
  35 + elif (verbose == True or verbose > 0) and flush == False:
  36 + sys.stdout.write(message+"\n")
  37 + elif flush == True:
  38 + sys.stdout.write(message+"..."+" "*(77-len(message))+"\r")
  39 + sys.stdout.flush()
61 40  
62 41 def connect_psql(host, user, passwd, db, number_of_cursor):# schema,
63 42 """
... ... @@ -156,7 +135,7 @@ def retrieve_xref_Ensembl(stable_id=None,mrnaAcc=None,
156 135 'AND value = "%s"'%gene_acronym)
157 136 return list(cursor.fetchone())
158 137  
159   -def format_description(fas_description, verbose=None):
  138 +def format_description(fas_description, verbose=False):
160 139 '''
161 140 Takes a fasta description line and try to retrieve informations out of it
162 141 if it has a known format.
... ...