Commit afd0c15b65128539f6a02e96c915630294cdb75b

Authored by Jean-Michel Garant
1 parent 31b2e166
Exists in stable-0.3 and in 1 other branch master

merge initial

Showing 1 changed file with 132 additions and 0 deletions   Show diff stats
merge.py 0 → 100755
... ... @@ -0,0 +1,132 @@
  1 +#!/usr/bin/env python
  2 +
  3 +# Identification of potential RNA G-quadruplexes by G4RNA screener.
  4 +# Copyright (C) 2018 Jean-Michel Garant
  5 +#
  6 +# This program is free software: you can redistribute it and/or modify
  7 +# it under the terms of the GNU General Public License as published by
  8 +# the Free Software Foundation, either version 3 of the License, or
  9 +# (at your option) any later version.
  10 +#
  11 +# This program is distributed in the hope that it will be useful,
  12 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 +# GNU General Public License for more details.
  15 +#
  16 +# You should have received a copy of the GNU General Public License
  17 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 +
  19 +import os
  20 +import sys
  21 +import utils
  22 +import argparse
  23 +import pandas as pd
  24 +
class float_range(object):
    """
    Inclusive range of floats used to validate a score threshold given by the
    user.

    An instance compares equal to every number lying inside [start, end], which
    lets it be used as the single entry of an argparse ``choices`` list. Its
    repr renders the authorized range as ``[start:end]``. A bound set to None
    leaves that side of the range open; with both bounds None every number is
    accepted.
    """

    def __init__(self, start, end):
        # start / end: inclusive lower / upper bound, or None for "no bound"
        self.start = start
        self.end = end

    def __eq__(self, other):
        """
        Return True when ``other`` lies within the range (bounds included).

        Operands that cannot be ordered against the bounds yield
        NotImplemented, so Python falls back to its default comparison
        instead of raising.
        """
        try:
            # Identity test on None: a bound of 0 is a real bound, not "open".
            above_start = self.start is None or self.start <= other
            below_end = self.end is None or other <= self.end
        except TypeError:
            return NotImplemented
        return above_start and below_end

    def __ne__(self, other):
        """
        Negation of __eq__; spelled out because Python 2 does not derive it.
        """
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __repr__(self):
        return '[{0}:{1}]'.format(self.start, self.end)
  42 +
def custom(sub_df):
    """
    Collect the values of the ``sequence`` attribute of ``sub_df`` into a
    plain list, preserving their order.
    """
    sequences = sub_df.sequence
    # Alternative aggregate kept for reference:
    #     len(''.join(sub_df.sequence))
    return [entry for entry in sequences]
  46 +
def merge_g4rna(df, window=60, step=10,
        cGcC=False, G4H=False, G4NN=False):
    """
    Flag rows of ``df`` that look like consecutive sliding windows.

    Two boolean columns are added to ``df`` in place, each only when the
    column it relies on is present:

    * ``start_match`` -- the row's ``start`` is exactly ``step`` after the
      previous row's, or exactly ``step`` before the next row's.
    * ``seq_match`` -- the last ``window - step`` characters of the row's
      ``sequence`` equal the first ``window - step`` characters of the next
      row's, or the same relation holds with the previous row.

    df      -- pandas DataFrame, expected to hold ``start`` and/or
               ``sequence`` columns
    window  -- window length used to produce the rows
    step    -- offset between the starts of two consecutive windows
    cGcC, G4H, G4NN -- score thresholds; accepted but not used yet

    The annotated frame is printed to standard output and returned.
    """
    # Number of characters shared by two consecutive windows. It follows
    # window and step (50 for the default 60/10) instead of being fixed.
    overlap = window - step
    if 'start' in df.columns:
        starts = df['start']
        df['start_match'] = (
            starts.eq(starts.shift() + step) |
            starts.shift(-1).eq(starts + step))
    if 'sequence' in df.columns:
        if overlap > 0:
            tails = df['sequence'].str[-overlap:]
            heads = df['sequence'].str[:overlap]
            df['seq_match'] = (
                tails.eq(heads.shift(-1)) |
                tails.shift().eq(heads))
        else:
            # Windows that do not overlap share no characters to compare.
            df['seq_match'] = False
    # Parenthesized form prints the same thing under Python 2 and Python 3.
    print(df)
    return df
  66 +
def arguments():
    """
    Arguments management

    Build the command line parser, parse the arguments of the current process
    and return the resulting namespace with these attributes:

    tsv   -- open file to read the TSV from; standard input when the argument
             is omitted or given as "-"
    cGcC  -- False when --cGcC is absent, 4.5 when given without a value,
             otherwise the supplied float (must be >= 0)
    G4H   -- False when --G4H is absent, 0.9 when given without a value,
             otherwise the supplied float (must be within [-4, 4])
    G4NN  -- False when --G4NN is absent, 0.5 when given without a value,
             otherwise the supplied float (must be within [0, 1])
    """
    # declare argument parser
    parser = argparse.ArgumentParser(formatter_class=utils.Formatter,
        prog=os.path.basename(__file__),
        description="Merge positive windows of screen.py output and "
        "discard windows below the threshold(s)",
        epilog="G4RNA screener  Copyright (C) 2018  Jean-Michel Garant "
        "This program comes with ABSOLUTELY NO WARRANTY. This is free "
        "software, and you are welcome to redistribute it under certain "
        "conditions <http://www.gnu.org/licenses/>.")
    # TSV input from STDIN is supported by default or using argument "-".
    # nargs='?' is what lets the positional be omitted so that the declared
    # default actually applies.
    parser.add_argument('tsv',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help='TSV file (tab separated values), - for default STDIN',
        metavar='TSV')
    # (flag, score name shown in help, authorized range, threshold used when
    # the flag is given without a value)
    score_options = (
        ("--cGcC", "cGcC", float_range(0, None), 4.5),
        ("--G4H", "G4Hunter", float_range(-4, 4), 0.9),
        ("--G4NN", "G4NN", float_range(0, 1), 0.5),
    )
    for flag, label, allowed, fallback in score_options:
        parser.add_argument(flag,
            type=float,
            nargs='?',
            choices=[allowed],
            # const is the value stored for a bare flag; default for no flag
            const=fallback,
            default=False,
            help="Use {0} score threshold to determine positive windows "
            "(default: {1})".format(label, fallback),
            metavar="FLOAT")
    return parser.parse_args()
  121 +
def main():
    """
    Script entry point: parse the command line, load the TSV into a DataFrame
    indexed by its first column and hand it to merge_g4rna with a window of
    60, a step of 10 and the thresholds chosen by the user.
    """
    options = arguments()
    frame = pd.read_csv(args_tsv(options), sep='\t', index_col=0) \
        if False else pd.read_csv(options.tsv, sep='\t', index_col=0)
    thresholds = (options.cGcC, options.G4H, options.G4NN)
    # Writing the result as TSV on standard output is not wired in yet.
    merge_g4rna(frame, 60, 10, *thresholds)


if __name__ == '__main__':
    main()
... ...