#!/usr/bin/perl -w
# A program to parse MatInspector output and print CisML formatted data

use strict;
use warnings;

# Run as a script; stay quiet when the file is loaded with require/do so the
# helpers below can be exercised on their own.
main() unless caller;

# Read a MatInspector report from the files named on the command line (or
# from STDIN when none are given) and print the converted report on STDOUT.
#
# Lines before the dashed table ruler are only searched for the
# "sequence file:" parameter.  After the ruler, "Sequence:" lines set the
# name attached to the following hits, every other non-blank line is taken
# as a hit row, and the first blank line ends the table (and the reading).
sub main {
    my $sequence_file = '';    # stays empty if the report never names one
    my $seq_name      = '';
    my $in_table      = 0;
    my %hits_for;              # pattern accession => list of hit records

  LINE:
    while (my $line = <>) {
        # Drop the newline and any trailing blanks / carriage return so
        # they leak neither into names nor into the hit sequence.
        $line =~ s/\s+\z//;

        if ($line =~ /sequence file:\s*(.*)/) {
            $sequence_file = $1;
        }
        elsif ($line =~ /^-+$/) {
            $in_table = 1;
        }
        elsif ($in_table) {
            if ($line =~ /Sequence:\s*(.*)/) {
                $seq_name = $1;
            }
            elsif ($line eq '') {
                last LINE;
            }
            else {
                my ($pattern_acc, @fields) = parse_hit_row($line);
                if (!defined $pattern_acc) {
                    # Recording a half-parsed row would produce a hit with
                    # undefined coordinates, so report it and move on.
                    warn "Skipping unparsable hit line $.: $line\n";
                    next LINE;
                }
                # Record layout: start, stop, sequence, core similarity,
                # matrix similarity, sequence name.
                push @{ $hits_for{$pattern_acc} }, [ @fields, $seq_name ];
            }
        }
    }

    print format_cisml($sequence_file, \%hits_for);
    return;
}

# Split one row of the hit table into its values.
#
# Takes the row text (columns separated by " | ").  Returns the list
# (accession, start, stop, sequence, core similarity, matrix similarity),
# or an empty list when the row does not have five columns or its position
# column is not of the form "NUMBER (+)" / "NUMBER (-)".
#
# start is the reported position.  stop lies length(sequence) - 1 further
# on for "+" hits and length(sequence) - 1 back for "-" hits.
sub parse_hit_row {
    my ($row) = @_;

    my ($pattern_acc, $hit_loc, $core_sim, $mat_sim, $hit_seq)
        = split /\s+\|\s+/, $row;
    return unless defined $hit_seq;

    $pattern_acc =~ s/\s//g;
    # split leaves trailing blanks on the last column; they must not be
    # counted when the far end of the hit is computed.
    $hit_seq =~ s/\s+\z//;

    my ($position, $strand) = $hit_loc =~ /(\d+)\s+\(([+-])\)/
        or return;

    my $span     = length($hit_seq) - 1;
    my $hit_stop = $strand eq '+' ? $position + $span : $position - $span;

    return ($pattern_acc, $position, $hit_stop, $hit_seq, $core_sim, $mat_sim);
}

# Build the text of the converted report.
#
# Takes the sequence file name and a reference to the hash of hit records
# keyed by pattern accession.  Returns the complete output as one string.
# Patterns are emitted in sorted order so repeated runs give identical
# output.
#
# NOTE(review): the literals below hold only whitespace and interpolated
# values; the CisML element tags appear to be missing from them -
# presumably lost before this revision.  They are reproduced unchanged;
# restore the markup from the CisML schema before relying on the output.
sub format_cisml {
    my ($sequence_file, $hits_for) = @_;

    # Header
    my $out = "\n";
    $out .= "\n";
    $out .= "\tMatInspector\n";
    $out .= "\t\n\t\t$sequence_file\n\t\n";

    # Hits, grouped by pattern
    for my $pattern (sort keys %{$hits_for}) {
        $out .= "\t\n";
        $out .= "\t\t\n";
        for my $match (@{ $hits_for->{$pattern} }) {
            $out .= "\t\t\t\n";
            $out .= "\t\t\t\t$match->[2]\n";    # hit sequence
            $out .= "\t\t\t\t$match->[3]\n";    # core similarity
            $out .= "\t\t\t\n";
        }
        $out .= "\t\t\n";
        $out .= "\t\n";
    }

    # Footer
    $out .= "\n";
    return $out;
}

=head1 NAME

matinspector2cisml.pl

=head1 SYNOPSIS

matinspector2cisml.pl [matinspector_file ...]

=head1 DESCRIPTION

Converts output of MatInspector (http://www.gsf.de/biodv/matinspector.html)
to CisML XML format. MatInspector output does not provide all of the
information generally included in CisML.

MatInspector can be obtained from L<http://www.gsf.de/biodv/matinspector.html>.

MatInspector output looks like this:

 MatInspector Release 1.0 September 1995  Thu Nov 16 11:48:07 1995

 Solution parameters:
 ~~~~~~~~~~~~~~~~~~~~
 sequence file: chr3.seq
 selected matrices:
 F$ABF1_01 (ABF1 )  re: 0.13  core sim: 0.80  matrix sim: 0.85

 Explanation for column output:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 -> Matrix positions correspond to sense strand numbering,
    but all sequences are given in 5'-3' direction.
 -> n/a in column 'core simil.' indicates, that no core search was conducted.
 -> n/a in column 'matrix simil.' indicates, that no matrix similarity
    was calculated, because the sequences was too short (only the core
    was found). In that case the core position is given in brackets in
    column 'matrix position'.
 -> Capital letters within the sequence indicate the core string.

 matrix     | matrix        | core   | matrix | sequence
 name       | position(str) | simil. | simil. |
 -----------------------------------------------------------------------
 Sequence: SCCHRIII
 F$ABF1_01  |    820 (+)    | 0.951  | 0.930  | tcgcatcattatgcACGGcttg
 F$ABF1_01  |   8203 (+)    | 0.976  | 0.876  | cgatgtcatagagtACGTgtca
 F$ABF1_01  |  14114 (-)    | 1.000  | 0.914  | acacatccttaaatACGAaagt
 F$ABF1_01  |  14584 (-)    | 0.976  | 0.931  | ggatatcattgcaaACGTcggg
 F$ABF1_01  |  17119 (-)    | 0.951  | 0.877  | actaatcaccgcgaACGGaaac
 F$ABF1_01  |  24619 (+)    | 1.000  | 0.857  | tagtatcgcgctgcACGAgcgt
 ...
 F$ABF1_01  | 293524 (-)    | 1.000  | 0.908  | ttatatcgccatatACGAaaat
 F$ABF1_01  | 301286 (+)    | 0.976  | 0.861  | ttgggtcatataaaACGTctgc
 F$ABF1_01  | 304162 (-)    | 0.951  | 0.901  | tgtcgtcattgaagACGGtaaa
 F$ABF1_01  | 309562 (+)    | 1.000  | 0.925  | acaagtcattgagaACGAaatt
 F$ABF1_01  | 314756 (-)    | 0.951  | 0.959  | ccacatcattatgcACGGcact

 In 1 sequences 85 matches to the matrix F$ABF1_01 were found.

=head1 OPTIONS

=head1 EXAMPLES

matinspector2cisml.pl matinspector_file

=head1 COPYRIGHT

# Copyright (C) 2004 by Peter M. Haverty, Trustees of Boston University

=head1 AUTHOR

Peter M. Haverty phaverty@bu.edu

=head1 SEE ALSO

=head1 REVISION HISTORY

 $Id: matinspector2cisml.pl,v 1.4 2004/02/22 21:41:37 phaverty Exp $

 $Log: matinspector2cisml.pl,v $
 Revision 1.4  2004/02/22 21:41:37  phaverty
 dunno

 Revision 1.3  2004/02/18 15:26:56  phaverty
 Changed sequence in matched-element to a sequence element.

 Revision 1.2  2004/02/18 02:50:14  phaverty
 changed documentation

 Revision 1.1  2004/02/18 02:41:27  phaverty
 Added some thinger2cisml.pl parsers

=cut