Commit 79ab76f25b7c52e323b67b1e8a66926acd2d1ece

Authored by Jean-Michel Garant
1 parent 328b7882
Exists in stable-0.3 and in 1 other branch master

rewriting trimer transformation, efficient pandas usage

Showing 3 changed files with 30 additions and 31 deletions   Show diff stats
LICENSE
... ... @@ -620,18 +620,8 @@ copy of the Program in return for a fee.
620 620  
621 621 END OF TERMS AND CONDITIONS
622 622  
623   - How to Apply These Terms to Your New Programs
624   -
625   - If you develop a new program, and you want it to be of the greatest
626   -possible use to the public, the best way to achieve this is to make it
627   -free software which everyone can redistribute and change under these terms.
628   -
629   - To do so, attach the following notices to the program. It is safest
630   -to attach them to the start of each source file to most effectively
631   -state the exclusion of warranty; and each file should have at least
632   -the "copyright" line and a pointer to where the full notice is found.
633   -
634 623 Identification of potential RNA G-quadruplexes by G4RNA screener.
  624 + doi: 10.1093/bioinformatics/btx498.
635 625 Copyright (C) 2018 Jean-Michel Garant
636 626  
637 627 This program is free software: you can redistribute it and/or modify
... ... @@ -647,24 +637,7 @@ the "copyright" line and a pointer to where the full notice is found.
647 637 You should have received a copy of the GNU General Public License
648 638 along with this program. If not, see <http://www.gnu.org/licenses/>.
649 639  
650   -Also add information on how to contact you by electronic and paper mail.
651   -
652   - If the program does terminal interaction, make it output a short
653   -notice like this when it starts in an interactive mode:
654   -
655 640 G4RNA screener Copyright (C) 2018 Jean-Michel Garant
656 641 This program comes with ABSOLUTELY NO WARRANTY. This is free
657 642 software, and you are welcome to redistribute it under certain
658 643 conditions <http://www.gnu.org/licenses/>.
659   -
660   - You should also get your employer (if you work as a programmer) or school,
661   -if any, to sign a "copyright disclaimer" for the program, if necessary.
662   -For more information on this, and how to apply and follow the GNU GPL, see
663   -<http://www.gnu.org/licenses/>.
664   -
665   - The GNU General Public License does not permit incorporating your program
666   -into proprietary programs. If your program is a subroutine library, you
667   -may consider it more useful to permit linking proprietary applications with
668   -the library. If this is what you want to do, use the GNU Lesser General
669   -Public License instead of this License. But first, please read
670   -<http://www.gnu.org/philosophy/why-not-lgpl.html>.
... ...
screen.py
... ... @@ -62,9 +62,10 @@ def apply_network(ann,
62 62 network_file = open(ann,'r')
63 63 ann = pickle.load(network_file)
64 64 network_file.close()
65   - RNome_trans_df = kmer_transfo(RNome_df, 3, 'length', 'sequence', 'g4',
66   - int(wdw_len), jellyfish=False, overlapped=True,
67   - verbose=verbose)
  65 + RNome_trans_df = trimer_transfo(RNome_df, 'sequence', verbose=verbose)
  66 +# RNome_trans_df = kmer_transfo(RNome_df, 3, 'length', 'sequence', 'g4',
  67 +# int(wdw_len), jellyfish=False, overlapped=True,
  68 +# verbose=verbose)
68 69 RNome_df = submit_seq(ann, RNome_trans_df.drop('G4NN',axis=1),
69 70 [c for c in columns if c != 'G4NN'], "G4NN",
70 71 verbose=verbose)
... ...
utils.py
... ... @@ -364,3 +364,28 @@ def kmer_transfo(
364 364 df.loc[row,di_ntd] = freq
365 365 verbosify(verbose, "Kmer transformed")
366 366 return df
  367 +
def trimer_transfo(
        df_,
        sequence_column,
        verbose=False):
    """
    Describe each sequence by the proportions of its overlapping trimers.

    A copy of ``df_`` is extended with one column per trimer over the
    alphabet A/U/C/G (64 columns, named after the trimer, e.g. 'AUG').
    Sequences are upper-cased and every 'T' is read as 'U' before
    counting; the content of ``sequence_column`` itself is left untouched.
    Each value is the number of (overlapping) occurrences of the trimer
    divided by ``len(sequence) - 2``, i.e. the number of trimer positions.

    Note: sequences shorter than 3 nt are not special-cased, so their
    denominator is zero or negative.

    df_              -- pandas dataframe holding the sequences
    sequence_column  -- name of the column of ``df_`` containing sequences
    verbose          -- forwarded to verbosify() to report completion

    Return pandas dataframe.
    """
    df = df_.copy()
    nts = ['A', 'U', 'C', 'G']
    # Both the normalised sequences and the denominator are identical for
    # every trimer: compute them once instead of once per column.
    sequences = df[sequence_column].str.upper().str.replace('T', 'U')
    positions = df[sequence_column].str.len() - 2
    for nt1 in nts:
        for nt2 in nts:
            for nt3 in nts:
                # Only the first nucleotide is consumed; the other two are
                # checked with a lookahead so overlapping trimers (e.g.
                # 'GGG' in 'GGGG') are all counted.
                pattern = nt1 + "(?=" + nt2 + nt3 + ")"
                df[nt1 + nt2 + nt3] = sequences.str.count(pattern) / positions
    verbosify(verbose, "trimer transformed")
    return df
... ...