#!/usr/bin/perl -w ##### A script to take output from 'possum' by Martin Frith ##### and return CisML formatted XML ##### Requires that matrix fasta first lines specify ##### matrix accession then matrix name, separated by one ##### or more spaces or a tab or a pipe "|" $in_hits = 0; #Flag to set when get to first hit print < POSSUM END ; while ($line = <>) { chomp $line; if ($line =~ /^\s*$/) { next; } # Blank line ### Parse Options unless ($in_hits) { if ($line =~ /^Sequence\s+file:\s+(.*)\b/) { print "\t\t$1\n"; $seqs_ref = &parse_sequences_file($1); } if ($line =~ /^Matrix\s+file:\s+(.*)\b/) { print "\t\t$1\n"; } if ($line =~ /^Lowercase\s+filtering:\s+(.*)\b/) { $on_off = lc($1); print "\t\t\n"; } if ($line =~ /^Score\s+threshold:\s+(.*)\b/) { print "\t\t$1\n"; } if ($line =~ /^Range\s+for\s+local\s+abundances:\s+(.*)\b/) { print "\t\t$1\n"; } if ($line =~ /^Pseudocount:\s+(.*)\b/) { print "\t\t$1\n"; print "\t\n"; $in_hits = 1; next; } } ### Parse Sequences if ($in_hits) { if ($line =~ /^>(.*?)[\s\|](.*)/) { $gene_acc = $1; $gene_name = $2; $sequences{$gene_acc} = $gene_name; } else { #Blank lines skipped. Rest are seqs or hits @words = split(/\s+/,$line); $pattern_acc = shift(@words); %element = (); $element{'score'} = pop(@words); $element{'pattern_sequence'} = pop(@words); $element{'strand'} = pop(@words); $element{'stop'} = pop(@words); pop(@words); # Get rid of 'to' $element{'start'} = pop(@words); $patterns{$pattern_acc} = join(" ",@words); push(@{$hits{$pattern_acc}{$gene_acc}}, {%element}); } } } # End of parsing foreach $pattern_acc (sort keys(%hits)) { print "\t\n"; foreach $gene_acc (sort keys(%{$hits{$pattern_acc}})) { $length = length($$seqs_ref{$gene_acc}); print "\t\t\n"; foreach $element (@{$hits{$pattern_acc}{$gene_acc}}) { if ($$element{'strand'} eq '-') { $temp = $$element{'stop'}; $$element{'stop'} = $$element{'start'}; $$element{'start'} = $temp; } print "\t\t\t$$element{'pattern_sequence'}\n"; } print "\t\t\n"; } print "\t\n"; } print "\n"; sub parse_sequences_file { my $sequence_file = shift; my %seqs = (); open(SEQ,$sequence_file) || die "Failed to open sequences file: $sequence_file\n"; while (my $line = ) { chomp $line; if ($line =~ /^>(.*?)[\s\|](.*)/) { $gene_acc = $1; $gene_name = $2; } else { $seqs{$gene_acc} .= $line; } } return \%seqs; } =head1 NAME =head1 SYNOPSIS =head1 DESCRIPTION =head1 OPTIONS =head1 EXAMPLES =head1 COPYRIGHT # Copyright (C) 2004 by Peter M. Haverty, Trustees of Boston University =head1 AUTHOR Peter M. Haverty phaverty@bu.edu =head1 SEE ALSO =head1 REVISION HISTORY $Id: possum2cisml.pl,v 1.6 2004/08/10 14:17:34 phaverty Exp $ $Log: possum2cisml.pl,v $ Revision 1.6 2004/08/10 14:17:34 phaverty added cbust parser and files Revision 1.5 2004/07/27 18:22:24 phaverty Took out extra print. Revision 1.4 2004/07/27 18:02:52 phaverty Fixed sequence-filtering Revision 1.3 2004/07/27 17:57:49 phaverty Added in end tags Revision 1.2 2004/07/27 17:55:37 phaverty Added getting length of sequences. Revision 1.1 2004/07/27 17:06:08 phaverty it is what it is =cut