Commit 815b5ad47384900357ad97a5af36c684f60eda2b

Authored by Jean-Michel Garant
1 parent bbcf4c26
Exists in master

ANN train from fasta prototype

g4rna_screener
1   -Subproject commit 4d715e00130141aaa0bb03f1195a5aeab80cd885
  1 +Subproject commit 9d7ff5045fb4367abefbe24988e0b25f5e3fcfb1
... ...
sample_labelled.fas 0 → 100644
... ... @@ -0,0 +1,15 @@
  1 +>Telomeric repeat-containing RNA (TERRA)|label=1
  2 +UUAGGGUUAGGGUUAGGGUUAGGGUUAGGGUUAGGGUUAGGGUUAGGG
  3 +>NM_000633 chr18:63318709-63318733|label=1
  4 +GGGGGCCGUGGGGUGGGAGCUGGGG
  5 +>hg38_refGene_NM_002524.4 range=chr1:114716863-114716883 5'pad=0 3'pad=0 strand=- repeatMasking=none|label=1
  6 +UGUGGGAGGGGCGGGUCUGGG
  7 +>ENST00000265340 chr5:135028038:135028073:-1|label=1
  8 +AGCGGGCAGUGCGGGCCUGGCGGGAGGUGGGGGAGG
  9 +>Spinach aptamer (false negative example)|label=0
  10 +GGACGCGACCGAAAUGGUGAAGGACGGGUCCAGUGCGAAACACGCACUGU
  11 +UGAGUAGAGUGUGAGCUCCGUAACUGGUCGCGUC
  12 +>String of U (T are internally converted to U)|label=0
  13 +UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
  14 +>String of C (sequences are case insensitive)|label=0
  15 +cccccccccccccccccCCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccc
... ...
train_from_fasta.py 0 → 100755
... ... @@ -0,0 +1,281 @@
  1 +#!/usr/bin/env python2.7
  2 +
  3 +# Software to support development surrounding G4RNA screener.
  4 +# Copyright (C) 2019 Jean-Michel Garant
  5 +#
  6 +# This program is free software: you can redistribute it and/or modify
  7 +# it under the terms of the GNU General Public License as published by
  8 +# the Free Software Foundation, either version 3 of the License, or
  9 +# (at your option) any later version.
  10 +#
  11 +# This program is distributed in the hope that it will be useful,
  12 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 +# GNU General Public License for more details.
  15 +#
  16 +# You should have received a copy of the GNU General Public License
  17 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 +
  19 +import os
  20 +import sys
  21 +import argparse
  22 +import subprocess
  23 +import pickle
  24 +
  25 +from StringIO import StringIO
  26 +from pybrain.structure.modules import SwitchLayer,SigmoidLayer
  27 +from pybrain.datasets import ClassificationDataSet
  28 +from pybrain.tools.shortcuts import buildNetwork
  29 +from pybrain.supervised.trainers import RPropMinusTrainer
  30 +
  31 +#from rpy2.robjects import FloatVector, r
  32 +#from rpy2.robjects.packages import importr
  33 +#base = importr('base')
  34 +#ROCR = importr('ROCR')
  35 +
  36 +from g4rna_screener import utils
  37 +from g4rna_screener import g4base
  38 +
  39 +def roc_curve(values, targets):
  40 + """
  41 + Produces ROC curves to evaluate the performance of a predictive tool.
  42 +
  43 + Creates "Rplot001.svg" file of the ROC curve in the current directory.
  44 +
  45 + Returns a FloatVector.
  46 + [0] Area Under ROC curve (AUC)
  47 + [1] Partial Area Under ROC curve (pAUC), first half
  48 + """
  49 + rvalues = FloatVector(values)
  50 + rtargets = FloatVector(targets)
  51 + return r('''
  52 + roc <- function(values, targets) {
  53 + svg()
  54 + pred <- prediction(values, targets)
  55 + plot(performance(pred,"tpr","fpr"))
  56 + auc <- performance(pred,"auc")
  57 + pauc <- performance(pred,"auc",fpr.stop=0.5)
  58 + as.numeric(c(auc@y.values[[1]], pauc@y.values[[1]]))
  59 + }''')(rvalues, rtargets)
  60 +
  61 +def gen_trainset_df(
  62 + fasta_file,
  63 + label="|label=",
  64 + output="output",
  65 + verbose=False):
  66 + """
  67 + Builds a single neural network and pickles it.
  68 + """
  69 + trainset = g4base.gen_G4RNA_df(
  70 + utils.fasta_fetcher(
  71 + fasta_file,
  72 + 0,
  73 + 0,
  74 + verbose=verbose),
  75 + ['description','sequence'],
  76 + 0)
  77 + trainset[['description','g4']] = trainset.description.str.rsplit(
  78 + label,
  79 + n=1,
  80 + expand=True,)
  81 + trainset['length'] = trainset.sequence.str.len()
  82 + kmer_trainset = utils.kmer_transfo(trainset, 3, 'length', 'sequence', 'g4',
  83 + 111111)
  84 + return kmer_trainset
  85 +
  86 +def write_pkl(
  87 + ann,
  88 + output):
  89 + network_file = open(output+".pkl",'w')
  90 + pickle.dump(ann, network_file)
  91 + network_file.close()
  92 +
  93 +def build_neural_network(
  94 + df_trn,
  95 + except_columns,
  96 + hidden_nodes,
  97 + hid_class,
  98 + out_class,
  99 + verbose=False):
  100 + """
  101 + Build an ANN out of a training dataframe ignoring except_columns with
  102 + specified hidden nodes and layer classes.
  103 +
  104 + Return a list of a trainer with its neural network structure.
  105 + [0] trained neural network
  106 + [1] artificial neural network structure
  107 + """
  108 + alldata = ClassificationDataSet(len(df_trn.columns)-len(except_columns),
  109 + 1, nb_classes=2)
  110 + trans_set = df_trn.drop(except_columns, axis=1)
  111 + for n in df_trn.index:
  112 + if str(df_trn.loc[n].g4) in ['yes','True','1'] or df_trn.loc[n].g4 is True:
  113 + target = 1.
  114 + elif str(df_trn.loc[n].g4) in ['no','False','N/A','0'] or df_trn.loc[n].g4 is False:
  115 + target = 0.
  116 + else:
  117 + print '"%s" label not allowed'%df_trn.loc[n].g4, df_trn.loc[n].g4
  118 + sys.exit(1)
  119 + alldata.addSample(trans_set.loc[n][:], target)
  120 + alldata._convertToOneOfMany()
  121 + fnn = buildNetwork(alldata.indim, hidden_nodes, alldata.outdim,
  122 + hiddenclass=hid_class, outclass=out_class)
  123 + trainer= RPropMinusTrainer(fnn, dataset=alldata,
  124 + etaminus=0.6, etaplus=1.2, deltamin=1e-6)
  125 +# Following values can be tweaked to maximise training at the expanse of time.
  126 +# ContinueEpochs minimal recommended = 4
  127 +# maxEpochs minimal recommended = 1000
  128 + trainer.trainUntilConvergence(verbose=False, validationProportion=0.5,
  129 + continueEpochs=10, maxEpochs=10000)
  130 + trainer.testOnData(verbose=False)
  131 + utils.verbosify(verbose, "ANN built")
  132 + return [trainer, fnn]
  133 +
  134 +#def test_neural_network(
  135 +# df_tst,
  136 +# trainer,
  137 +# fnn,
  138 +# except_columns,
  139 +# verbose=False):
  140 +# """
  141 +# Test an ANN using a testing dataframe ignoring except_columns and produces
  142 +# ROC curves in ANN_roc/ directory.
  143 +#
  144 +# Return a list.
  145 +# [0] number of true positives
  146 +# [1] number of true negatives
  147 +# [2] number of false positives
  148 +# [3] number of false negatives
  149 +# [4] total of predictions
  150 +# [5] success rate
  151 +# [6] area under ROC curve
  152 +# [7] partial (0.5) area under ROC curve
  153 +# """
  154 +# alldata_tst = ClassificationDataSet(len(df_tst.columns)-len(except_columns),
  155 +# 1, nb_classes=2)
  156 +# trans_set_tst = df_tst.drop(except_columns, axis=1)
  157 +# for n in df_tst.index:
  158 +# if str(df_tst.loc[n].g4) in ['yes','True','1'] or df_tst.loc[n].g4 is True:
  159 +# target = 1.
  160 +# elif str(df_tst.loc[n].g4) in ['no','False','N/A','0'] or df_tst.loc[n].g4 is False:
  161 +# target = 0.
  162 +# else:
  163 +# print '"%s" target not allowed'%df_tst.loc[n].g4, df_tst.loc[n].g4
  164 +# sys.exit(1)
  165 +# alldata_tst.addSample(trans_set_tst.loc[n][:], target)
  166 +# alldata_tst._convertToOneOfMany()
  167 +# default_stdout = sys.stdout
  168 +# custom_stdout = StringIO()
  169 +# sys.stdout = custom_stdout
  170 +# train_results = trainer.testOnData(verbose=True)
  171 +# test_results = trainer.testOnData(alldata_tst, verbose=True)
  172 +# sys.stdout = default_stdout
  173 +# custom_stdout = custom_stdout.getvalue().split('\n')
  174 +# ROC_curves_pred = []
  175 +# ROC_curves_labl = []
  176 +# for no, line in enumerate(custom_stdout):
  177 +# if line == "Testing on data:" and no == 1:
  178 +# pred_lst = []
  179 +# labl_lst = []
  180 +# elif line == "Testing on data:" and no != 1:
  181 +# ROC_curves_pred.append(pred_lst)
  182 +# ROC_curves_labl.append(labl_lst)
  183 +# pred_lst = []
  184 +# labl_lst = []
  185 +# elif line[0:9] == "out: ":
  186 +# pred_lst.append(
  187 +# float(line.lstrip("out: [ ").rstrip("]").split(" ")[0]))
  188 +# labl_lst.append(
  189 +# float(custom_stdout[no+1].lstrip("correct: [ ")
  190 +# .rstrip("]").split(" ")[0]))
  191 +# ROC_curves_pred.append(pred_lst)
  192 +# ROC_curves_labl.append(labl_lst)
  193 +# pred_results = []
  194 +# roc_names=["train","test"]
  195 +# for no ,curve in enumerate(ROC_curves_pred):
  196 +# tp = .0
  197 +# tn = .0
  198 +# fp = .0
  199 +# fn = .0
  200 +# for r in range(len(curve)):
  201 +# if ROC_curves_labl[no][r] == 1 and ROC_curves_pred[no][r] > 0.500:
  202 +# tp += 1
  203 +# elif ROC_curves_labl[no][r] == 0 and ROC_curves_pred[no][r] <= 0.500:
  204 +# tn += 1
  205 +# elif ROC_curves_labl[no][r] == 0 and ROC_curves_pred[no][r] > 0.500:
  206 +# fp += 1
  207 +# elif ROC_curves_labl[no][r] == 1 and ROC_curves_pred[no][r] <= 0.500:
  208 +# fn += 1
  209 +# drawer = roc_curve(ROC_curves_pred[no], ROC_curves_labl[no])
  210 +# subprocess.check_call(["mv", "Rplot001.svg",
  211 +# "./ANN_roc/ROC_%s.svg"%roc_names[no]])
  212 +# pred_results.append([tp,tn,fp,fn,tp+tn+fp+fn,(tp+tn)/(tp+tn+fp+fn),
  213 +# drawer[0], drawer[1]])
  214 +# utils.verbosify(verbose, "ANN tested")
  215 +# return pred_results
  216 +
  217 +def train_network(
  218 + fasta,
  219 + label,
  220 + output):
  221 + """
  222 + Manage functions to train the network.
  223 + """
  224 + except_cols = ['description','sequence','length','g4']
  225 + sets = {'train_set': gen_trainset_df(fasta, label)}
  226 + print(sets['train_set'].columns)
  227 + ann = build_neural_network(sets['train_set'], except_cols, 35,
  228 + SwitchLayer, SigmoidLayer, verbose=False)
  229 +# test_neural_network(sets['train_set'], ann[0], ann[1],
  230 +# except_cols, verbose=True)
  231 + write_pkl(ann[1], output)
  232 +
  233 +
  234 +def arguments():
  235 + """
  236 + Arguments management.
  237 + """
  238 + # declare argument parser
  239 + parser = argparse.ArgumentParser(
  240 + prog=os.path.basename(__file__),
  241 + description="Supports the training of a custom ANN to be supplied "
  242 + "to G4RNA screener/screen.py",
  243 + epilog="G4RNA screener dev Copyright (C) 2019 Jean-Michel Garant "
  244 + "This program comes with ABSOLUTELY NO WARRANTY. This is free "
  245 + "software, and you are welcome to redistribute it under certain "
  246 + "conditions <http://www.gnu.org/licenses/>.")
  247 + # FASTA input from STDIN is supported by default using argument "-"
  248 + parser.add_argument("fasta",
  249 + type=argparse.FileType('r'),
  250 + default=sys.stdin,
  251 + help='FASTA file, - for default STDIN',
  252 + metavar='FASTA')
  253 + parser.add_argument("-l","--label",
  254 + type=str,
  255 + nargs='?',
  256 + default='|label=',
  257 + help='String delimiter for label annotation in fasta description '
  258 + '(default: "|label=")',
  259 + metavar='STR')
  260 + parser.add_argument("-o","--output",
  261 + type=str,
  262 + nargs='?',
  263 + default='output',
  264 + help='String of desired output filename (default: output)',
  265 + metavar='STR')
  266 + if len(sys.argv[1:])==0:
  267 + parser.print_help()
  268 + parser.exit()
  269 + args = parser.parse_args()
  270 + return args
  271 +
  272 +def main():
  273 + """
  274 + Functions calls
  275 + """
  276 + args = arguments()
  277 + print(args)
  278 + train_network(args.fasta, args.label, args.output)
  279 +
  280 +if __name__ == '__main__':
  281 + main()
... ...