#!/usr/bin/perl -w
# A program to parse tfscan output and print CisML formatted data.
#
# Input is read from the files named on the command line, or from STDIN
# when none are given; output is written to STDOUT.

use strict;
use warnings;

# ----------------------------------------------------------------------
# Read in header lines and get sequence data
# ----------------------------------------------------------------------

# The first line names the scanned sequence and its coordinate range.
my $header = <>;
defined $header
    or die "tfscan2cisml.pl: no input (expected a tfscan header line)\n";
chomp $header;

my ($seq_name, $start, $stop)
    = $header =~ /of\s+(.*)\s+from\s+(\d+)\s+to\s+(\d+)/;
defined $stop
    or die "tfscan2cisml.pl: unrecognized tfscan header line: '$header'\n";

# Both end points are counted, hence the +1.
# NOTE(review): $seq_name and $sequence_length are parsed but never written
# to the output below; presumably they belong in the output markup.
my $sequence_length = ($stop - $start) + 1;

# The line following the header is discarded without being inspected
# (presumably a blank separator line -- TODO confirm against real output).
my $discarded = <>;

# ----------------------------------------------------------------------
# Read in hits and hash by pattern accession
# ----------------------------------------------------------------------

my %patterns;         # pattern accession => pattern name (first one seen)
my %hits;             # pattern accession => list of [start, stop, sequence]
my @pattern_order;    # accessions in order of first appearance

while (my $line = <>) {
    chomp $line;

    # split ' ' ignores leading whitespace, so an indented hit line does not
    # produce an empty first field that would shift every column by one.
    my @fields = split ' ', $line;
    next unless @fields;    # blank line

    if (@fields < 5) {
        warn "tfscan2cisml.pl: skipping malformed hit at input line $.: '$line'\n";
        next;
    }

    # Any fields beyond the fifth are ignored.
    my ($pattern_name, $pattern_acc, $hit_start, $hit_stop, $hit_seq) = @fields;

    unless (exists $patterns{$pattern_acc}) {
        $patterns{$pattern_acc} = $pattern_name;
        push @pattern_order, $pattern_acc;
    }
    push @{ $hits{$pattern_acc} }, [ $hit_start, $hit_stop, $hit_seq ];
}

# ----------------------------------------------------------------------
# Output
# ----------------------------------------------------------------------
# NOTE(review): the string literals printed below are kept exactly as found.
# Several consist of whitespace only, and the pattern name, hit start and
# hit stop collected above are never printed. It looks as though XML element
# tags were lost from these strings at some point -- confirm against the
# CisML schema and restore them before relying on this output.

# Print CisML header
print "\n";
print "\n";
print "\ttfscan\n";

# Print CisML hits, one group per pattern, in order of first appearance so
# that repeated runs on the same input produce identical output.
foreach my $pattern (@pattern_order) {
    print "\t\n";
    print "\t\t\n";
    foreach my $match (@{ $hits{$pattern} }) {
        # $match is [start, stop, sequence]; only the sequence is printed.
        print "\t\t\t$match->[2]\n";
    }
    print "\t\t\n";
    print "\t\n";
}

# Print end of CisML
print "\n";

=head1 NAME

tfscan2cisml.pl

=head1 SYNOPSIS

tfscan2cisml.pl

=head1 DESCRIPTION

Converts output of tfscan (http://ocgc.ca/programs/emboss/tfscan.html) to
CisML XML format. tfscan output does not provide all of the information
generally included in CisML.

tfscan is part of the EMBOSS package and can be obtained from
http://ocgc.ca/programs/emboss/tfscan.html

tfscan output looks like this:

    TFSCAN of HSFOS from 1 to 6210

    HS$CFOS_20 R08485 384 396 agttcccgtcaat
    DOG$ATP1A_01 R08484 3057 3063 gacatgg
    HS$CEBPA_01 R08471 4535 4540 cacgtg
    HS$GPB_05 R08210 3716 3721 gtatct
    ...

=head1 OPTIONS

=head1 EXAMPLES

tfscan2cisml.pl tfscan_file

=head1 COPYRIGHT

# Copyright (C) 2004 by Peter M. Haverty, Trustees of Boston University

=head1 AUTHOR

Peter M. Haverty phaverty@bu.edu

=head1 SEE ALSO

=head1 REVISION HISTORY

$Id: tfscan2cisml.pl,v 1.3 2004/02/22 21:41:37 phaverty Exp $

$Log: tfscan2cisml.pl,v $
Revision 1.3 2004/02/22 21:41:37 phaverty
dunno

Revision 1.2 2004/02/18 15:26:56 phaverty
Changed sequence in matched-element to a sequence element.

Revision 1.1 2004/02/18 02:41:28 phaverty
Added some thinger2cisml.pl parsers

=cut