Parallel Mira Assembly

advertisement
This Perl script was written by Shichen Wang under the direction of Eduard
Akhunov at Kansas State University (eakhunov@ksu.edu)
Input data instructions
The input for the script is a FASTQ file whose read headers carry additional
Block (BL), Phase (PH) and Contig (CT) tags.
Format
@DJB775P1:264:D0M7EACXX:3:1107:7206:67178-1 BL:0 PH:0 CT:td-k45_contig_51016
ATCATGCTAGCTGTAGCTGATCGTAGCTAGCTAGCTAGCTGATCGTAGCTAGCTAGCTAG
+
BHBHBH@H@HBHBHBHBHBHBHBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
Reads that belong to the same contig (CT) and share the same block (BL) and
phase (PH) numbers are extracted and assembled together with Mira.
The final output is "mira_contigs.output.fasta", which contains all the
assemblies.
Program below
#!/usr/bin/perl -w
use strict;
#use Parallel::ForkManager;
# multithread version not implemented yet.
# Usage text printed when the script is not given exactly two arguments.
my $sUsage = qq(
****************************************************************************
************
Modify the \$MIRA_BIN and \$CAP3 to fit your own system before running this
script.
Usage:
perl
$0
<num_threads>
<fastq>
Example:
perl
$0
1
phased_elf.fastq
****************************************************************************
*************
);
die $sUsage unless @ARGV == 2;

# $num_threads is accepted but not used anywhere below: the multithreaded
# version is not implemented yet.
my($num_threads, $fastq) = @ARGV;

# Path for executable mira and cap3.
# The trailing space matters: the command lines are built further down by
# plain string concatenation of these values with their arguments.
my $MIRA_BIN = "/home/DNA/Tools/mira_3.2.1_prod_linuxgnu_x86_64_static/bin/mira ";
my $CAP3 = "/home/DNA/Tools/CAP3/cap3 ";

# Three-argument open: the file name is taken literally, so leading or
# trailing whitespace and characters such as '>' or '|' in the name cannot
# change the open mode.
open(IN, '<', $fastq) or die "can't open file $fastq: $!\n";

# All assemblies are collected in this single FASTA file.
my $out_file = "mira_contigs.output.fasta";
open(my $out_fh, '>', $out_file) or die "can't write file $out_file: $!\n";

my $count_mira = 0;     # number of run_mira() invocations so far
my @data;               # FASTQ lines of the read group being collected
my $line_counter = 0;   # non-blank input lines seen so far
my $pre_id;             # BL/PH/CT tag of the group being collected
# Stream the FASTQ file and cut it into groups of consecutive records that
# share the same "BL:... PH:... CT:..." tag. Each finished group is handed
# to run_mira() for assembly.
while (my $line = <IN>)
{
    next if $line =~ /^\s+$/;       # ignore blank lines
    $line =~ s/\r?\n\z//;           # strip LF or CRLF so the tag regex can anchor
    $line_counter++;

    # Every 4th non-blank line, starting with the first, is a record header:
    # @DJB775P1:264:D0M7EACXX:3:1107:7206:67178-1 BL:0 PH:0 CT:td-k45_contig_51016
    if ($line_counter % 4 == 1)
    {
        # A plain declaration followed by an explicit match avoids the
        # "my $x = ... if ..." construct, whose value is unspecified when the
        # condition is false and could silently reuse the previous tag.
        my $id;
        if ($line =~ /(BL.*\d+)$/)
        {
            $id = $1;
        }
        else
        {
            die "line $.: FASTQ header has no BL/PH/CT tag: $line\n";
        }

        # A new tag closes the group collected so far.
        if (defined $pre_id and $id ne $pre_id)
        {
            run_mira($pre_id, @data);
            @data = ();
        }
        $pre_id = $id;
    }
    push @data, $line;
}

# Flush the last group after the loop. Doing it here instead of testing
# eof() inside the loop guarantees the final group is assembled even when
# the file ends with blank lines (which are skipped before any eof test).
run_mira($pre_id, @data) if @data;

close IN;

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "can't finish writing output file: $!\n";
# Assemble one group of reads with mira, merge the resulting contigs with
# cap3 and append the sequences to the global output file.
#
# Arguments:
#   $id   - group tag; passed on to processing_fasta_file()
#   @data - FASTQ lines of the group, four lines per read
#
# Returns nothing. The group is skipped when mira leaves no result file;
# the script dies when cap3 fails or a stale result cannot be removed.
sub run_mira
{
    my ($id, @data) = @_;

    $count_mira++;
    print STDERR "Runnig mira times: ", $count_mira, "\n";

    my $tmp_fasta_file = "data.tmp.fasta";
    my $tmp_qual_file = "data.tmp.fasta.qual";
    my $mira_contig =
        "mira_tmp_assembly/mira_tmp_d_results/mira_tmp_out.padded.fasta";
    my @cap_outputs = map { $mira_contig . $_ } (".cap.contigs",
        ".cap.singlets");

    # Every group reuses the same project name, so results of the previous
    # group are still on disk. Remove them first; otherwise a failed mira
    # run would pass the existence check below and the previous group's
    # contigs would be written out again under this group's tag.
    foreach my $stale ($mira_contig, @cap_outputs)
    {
        next unless -e $stale;
        unlink $stale or die "can't remove stale result $stale: $!\n";
    }

    generate_files($tmp_fasta_file, $tmp_qual_file, @data);

    my $cmd = $MIRA_BIN . "--project=mira_tmp --job=denovo,solexa,est -fasta=" . $tmp_fasta_file;
    print STDERR $cmd, "\n";

    # system() reports failure through its return value, it never throws,
    # so wrapping it in eval {} detects nothing.
    my $status = system($cmd);
    if ($status != 0)
    {
        warn "mira returned status $status for group ",
            (defined $id ? $id : '(undefined)'), "\n";
    }

    # No result file means there is nothing to merge for this group.
    return unless -e $mira_contig;

    if (system($CAP3 . $mira_contig) != 0)
    {
        die "cap3 failed on $mira_contig (status $?)\n";
    }

    processing_fasta_file($id, @cap_outputs);
}
# Convert FASTQ lines into a FASTA file and a matching quality file.
#
# Arguments:
#   $fasta - path of the FASTA file to write
#   $qual  - path of the quality file to write
#   @data  - FASTQ lines without line terminators, four per read
#            (header, sequence, separator, quality string)
#
# Both files are overwritten. The header is written to both files with its
# first '@' replaced by '>'. Quality characters are written as
# space-separated numbers, each being the character code minus 33.
# Dies when a file cannot be opened or completely written.
sub generate_files
{
    my ($fasta, $qual, @data) = @_;

    # Lexical handles and three-argument open: the paths are used verbatim
    # and the handles cannot collide with other bareword handles.
    open(my $fasta_fh, '>', $fasta) or die "can't write file $fasta: $!\n";
    open(my $qual_fh, '>', $qual) or die "can't write file $qual: $!\n";

    foreach my $ind (0 .. $#data)
    {
        my $field = $ind % 4;
        if ($field == 0)
        {
            # Header line: shared by both output files.
            (my $header = $data[$ind]) =~ s/\@/>/;
            print {$fasta_fh} $header, "\n";
            print {$qual_fh} $header, "\n";
        }
        elsif ($field == 1)
        {
            # Sequence line.
            print {$fasta_fh} $data[$ind], "\n";
        }
        elsif ($field == 3)
        {
            # Quality line: one number per character.
            my @q = map { ord($_) - 33 } split //, $data[$ind];
            print {$qual_fh} join(" ", @q), "\n";
        }
        # $field == 2 is the '+' separator line and is not written.
    }

    # Buffered write errors only surface when the handle is closed.
    close $fasta_fh or die "can't finish writing $fasta: $!\n";
    close $qual_fh or die "can't finish writing $qual: $!\n";
}
# Read FASTA files, renumber their records and append them to the global
# output handle $out_fh.
#
# Arguments:
#   $pre_id - group tag written after the new record name on each header
#   @files  - FASTA files to read, in order
#
# Records are renamed C1, C2, ... in reading order across all files, and
# the lines of each sequence are joined into one line. Records with no
# sequence lines are not written. Dies when a file cannot be opened.
sub processing_fasta_file
{
    my ($pre_id, @files) = @_;
    my %hash;
    my $count = 0;

    foreach my $f (@files)
    {
        open(my $in_fh, '<', $f) or die "can't open file $f: $!\n";
        my $id;
        while (my $line = <$in_fh>)
        {
            chomp $line;
            # Only a '>' at the start of a line opens a new record.
            if ($line =~ /^>/)
            {
                $count++;
                $id = "C" . $count;
                next;
            }
            # Text before the first header belongs to no record.
            next unless defined $id;
            $hash{$id} .= $line;
        }
        close $in_fh;
    }

    # Walk the counter instead of "keys %hash" so records are written in
    # reading order; hash key order differs from run to run.
    foreach my $n (1 .. $count)
    {
        my $id = "C" . $n;
        next unless exists $hash{$id};
        print {$out_fh} ">", $id, " ", $pre_id, "\n", $hash{$id}, "\n";
    }
}
Download