#!/usr/bin/perl
#
# cbust2cisml.pl - take output from 'cbust' by Martin Frith, produced with
# the "-f 0" format, and return CisML formatted XML on STDOUT.
#
# Input is read from the files named on the command line, or from STDIN.
#
# Requires that the matrix FASTA first lines specify the matrix accession
# and then the matrix name, separated by one or more spaces, a tab, or a
# pipe "|".
#
# NOTE(review): the copy of this script that was reviewed had every piece of
# angle-bracket markup stripped from its string literals (the header heredoc
# and all element tags).  The markup below was reconstructed from the
# published CisML schema.  Element and attribute names, in particular those
# used for the cbust-specific options, must be confirmed against the original
# script or the schema before this is relied on.

use strict;
use warnings;

# Table that maps one cbust option line to one CisML parameter line.
# Each rule is [ pattern with one capture, renderer taking the capture ].
# The rules are tried in order and the first match wins.
my @OPTION_RULES = (
    # File names are the first whitespace-delimited token after the label;
    # anything that follows on the line is ignored.
    [ qr/^Sequence\s+file:\s+(\S+)/,
      sub { xml_element('sequence-file', $_[0]) } ],
    [ qr/^Matrix\s+file:\s+(\S+)/,
      sub { xml_element('pattern-file', $_[0]) } ],
    # The value is lower-cased before being written to the attribute.
    [ qr/^Lowercase\s+filtering:\s+(.*?)\s*$/,
      sub {
          sprintf qq{\t\t<sequence-filtering on-off="%s"/>\n},
              xml_escape(lc $_[0]);
      } ],
    # NOTE(review): the element names below are reconstructed guesses.
    [ qr/^Cluster\s+score\s+threshold:\s+(.*)\b/,
      sub { xml_element('cluster-score-threshold', $_[0]) } ],
    [ qr/^Motif\s+score\s+threshold:\s+(.*)\b/,
      sub { xml_element('motif-score-threshold', $_[0]) } ],
    [ qr/^Range\s+for\s+local\s+abundances:\s+(.*)/,
      sub { xml_element('range-for-local-abundances', $_[0]) } ],
    [ qr/^Expected gap:\s+(.*)/,
      sub { xml_element('expected-gap', $_[0]) } ],
    [ qr/^Pseudocount:\s+(.*)/,
      sub { xml_element('pseudocount', $_[0]) } ],
);

# Escape the characters that are special in XML text and attribute values.
# An undefined value is rendered as the empty string.
sub xml_escape {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Render "<tag>value</tag>" as one line at the parameter indentation level.
sub xml_element {
    my ($tag, $value) = @_;
    return sprintf "\t\t<%s>%s</%s>\n", $tag, xml_escape($value), $tag;
}

# Translate one cbust option line into its CisML parameter line.
# Returns the XML text, or nothing when the line is not a known option.
sub option_to_xml {
    my ($line) = @_;
    for my $rule (@OPTION_RULES) {
        my ($pattern, $render) = @{$rule};
        if ($line =~ $pattern) {
            return $render->($1);
        }
    }
    return;
}

# Parse a sequence header line of the form
#     >ACCESSION[<space or pipe>NAME] (LENGTH ...
# Returns (accession, name, length), or the empty list when the line is not
# a sequence header.  The name is optional: a header that carries only an
# accession yields an empty name instead of being ignored, which would leave
# its hits attributed to the previous sequence.
sub parse_sequence_header {
    my ($line) = @_;
    return unless $line =~ m{^>([^\s|]*)(?:[\s|](.*))?\s\((\d+)};
    my ($accession, $name, $length) = ($1, $2, $3);
    $name = '' unless defined $name;
    return ($accession, $name, $length);
}

# Parse one hit line:
#     ACCESSION [NAME WORDS ...] START to STOP STRAND SCORE SEQUENCE
# Fields are taken from the right-hand end, so the pattern name may contain
# spaces.  Returns a hash reference with keys pattern_acc, pattern_name and
# element (the per-hit record), or nothing when the line has too few fields.
sub parse_hit_line {
    my ($line, $cluster_id) = @_;

    # split ' ' ignores leading whitespace, so an indented hit line does not
    # produce an empty accession.
    my @words = split ' ', $line;
    return if @words < 7;    # accession + start, 'to', stop, strand, score, sequence

    my $pattern_acc = shift @words;
    my %element     = (cluster_id => $cluster_id);
    $element{pattern_sequence} = pop @words;
    $element{score}            = pop @words;
    $element{strand}           = pop @words;
    $element{stop}             = pop @words;
    pop @words;              # get rid of 'to'
    $element{start}            = pop @words;

    return {
        pattern_acc  => $pattern_acc,
        pattern_name => join(' ', @words),    # whatever is left is the name
        element      => \%element,
    };
}

# Read cbust output from ARGV/STDIN and write the CisML document to STDOUT.
sub main {
    my %sequences;    # sequence accession => { NAME => ..., LENGTH => ... }
    my %patterns;     # pattern accession  => pattern name
    my %hits;         # pattern accession  => sequence accession => [ hit records ]
    my $gene_acc;     # accession of the sequence currently being read
    my $in_options  = 0;    # set once the "Sequence file:" line has been seen
    my $params_open = 1;    # the header below leaves <parameters> open

    # Close <parameters> exactly once, wherever the option block ends.
    my $close_parameters = sub {
        return unless $params_open;
        print "\t</parameters>\n";
        $params_open = 0;
    };

    print <<"END";
<?xml version="1.0" encoding="UTF-8"?>
<cis-element-search
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://zlab.bu.edu/schema/cisml cisml.xsd"
    xmlns="http://zlab.bu.edu/schema/cisml">
\t<program-name>cbust</program-name>
\t<parameters>
END

    LINE:
    while (my $line = <>) {
        chomp $line;
        next LINE if $line =~ /^\s*$/;    # blank line

        ### Parse options (only while the parameters element is still open)
        if ($params_open) {
            $in_options = 1 if $line =~ /^Sequence\s+file:/;
            if ($in_options) {
                my $xml = option_to_xml($line);
                if (defined $xml) {
                    print $xml;
                    next LINE;
                }
            }
        }

        ### Parse sequences and hits
        if (my ($accession, $name, $length) = parse_sequence_header($line)) {
            $close_parameters->();
            $gene_acc = $accession;
            $sequences{$gene_acc}{NAME}   = $name;
            $sequences{$gene_acc}{LENGTH} = $length;
        }
        elsif ($line =~ /^CLUSTER\s+(\d+)/) {
            my $cluster_id = $1;
            $close_parameters->();

            if (!defined $gene_acc) {
                warn "cbust2cisml: cluster $cluster_id found before any "
                    . "sequence header (input line $.)\n";
                $gene_acc = '';
            }

            # Throw away the cluster location, score and sequence lines.
            for my $skipped (1 .. 3) {
                my $discarded = <>;
            }

            # The rest are hits, until a blank line or end of input.
            HIT:
            while (defined(my $hit_line = <>)) {
                chomp $hit_line;
                last HIT if $hit_line =~ /^\s*$/;

                my $hit = parse_hit_line($hit_line, $cluster_id);
                if (!$hit) {
                    warn "cbust2cisml: skipping malformed hit at input "
                        . "line $.: $hit_line\n";
                    next HIT;
                }

                $patterns{ $hit->{pattern_acc} } = $hit->{pattern_name};
                push @{ $hits{ $hit->{pattern_acc} }{$gene_acc} },
                    $hit->{element};
            }
        }
    }    # end of parsing

    # Input that had no sequences or clusters still needs a closed block.
    $close_parameters->();

    ## Do output: one pattern element per matrix, one scanned-sequence per
    ## sequence it hit, one matched-element per hit.
    for my $pattern_acc (sort keys %hits) {
        printf qq{\t<pattern accession="%s" name="%s">\n},
            xml_escape($pattern_acc), xml_escape($patterns{$pattern_acc});

        for my $seq_acc (sort keys %{ $hits{$pattern_acc} }) {
            # NOTE(review): the length attribute is a reconstruction; the
            # stripped source only shows that the length was recorded.
            printf qq{\t\t<scanned-sequence accession="%s" name="%s" length="%s">\n},
                xml_escape($seq_acc),
                xml_escape($sequences{$seq_acc}{NAME}),
                xml_escape($sequences{$seq_acc}{LENGTH});

            for my $element (@{ $hits{$pattern_acc}{$seq_acc} }) {
                # Hits on the '-' strand are written with start and stop
                # exchanged (presumably so that start > stop marks a
                # reverse-strand match - confirm with CisML consumers).
                # Local copies are used so the stored record is not modified.
                my ($start, $stop) = @{$element}{qw(start stop)};
                ($start, $stop) = ($stop, $start)
                    if $element->{strand} eq '-';

                printf qq{\t\t\t<matched-element start="%s" stop="%s" score="%s" clusterid="%s"><sequence>%s</sequence></matched-element>\n},
                    xml_escape($start),
                    xml_escape($stop),
                    xml_escape($element->{score}),
                    xml_escape($element->{cluster_id}),
                    xml_escape($element->{pattern_sequence});
            }
            print "\t\t</scanned-sequence>\n";
        }
        print "\t</pattern>\n";
    }
    print "</cis-element-search>\n";

    return;
}

# Run as a program; stay quiet when the file is loaded with require/do.
main() unless caller;

1;

=head1 NAME

cbust2cisml.pl - convert 'cbust' output (format "-f 0") to CisML XML

=head1 SYNOPSIS

    cbust2cisml.pl cbust_output.txt > hits.xml

    some_command_producing_cbust_output | cbust2cisml.pl > hits.xml

=head1 DESCRIPTION

Reads the output of 'cbust' by Martin Frith, produced with the "-f 0"
format, from the files named on the command line or from standard input,
and writes a CisML document to standard output.

The option lines at the top of the cbust output become the parameters of
the document.  Every hit listed under a CLUSTER becomes a matched element,
grouped by matrix (pattern) and then by sequence.  Hits on the '-' strand
are written with their start and stop positions exchanged.

The matrix FASTA first lines must give the matrix accession and then the
matrix name, separated by one or more spaces, a tab, or a pipe "|".

=head1 OPTIONS

=head1 EXAMPLES

=head1 COPYRIGHT

# Copyright (C) 2004 by Peter M. Haverty, Trustees of Boston University

=head1 AUTHOR

Peter M. Haverty phaverty@bu.edu

=head1 SEE ALSO

=head1 REVISION HISTORY

$Id: cbust2cisml.pl,v 1.2 2004/08/09 21:24:56 phaverty Exp $

$Log: cbust2cisml.pl,v $
Revision 1.2  2004/08/09 21:24:56  phaverty
Added clusterid

Revision 1.1  2004/08/09 21:18:33  phaverty
First try

=cut