This Perl script was written by Shichen Wang under the direction of Eduard Akhunov at Kansas State University (eakhunov@ksu.edu) Input data instructions Input for the script is a fastq file, which has additional information about Block, Phase and Contig. Format @DJB775P1:264:D0M7EACXX:3:1107:7206:67178-1 BL:0 PH:0 CT:td-k45_contig_51016 ATCATGCTAGCTGTAGCTGATCGTAGCTAGCTAGCTAGCTGATCGTAGCTAGCTAGCTAG + BHBHBH@H@HBHBHBHBHBHBHBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB Reads that belong to the same contig and have the same number of block (BL) and phase (PH) will be extracted and assembled with Mira. The final output is "mira_contigs.output.fasta", which contains all the assemblies. Program below #!/usr/bin/perl -w use strict; #use Parallel::ForkManager; # multithread version not implemented yet. my $sUsage = qq( **************************************************************************** ************ Modify the \$MIRA_BIN and \$CAP3 to fit your own system before running this script. Usage: perl $0 <num_threads> <fastq> Example: perl $0 1 phased_elf.fastq **************************************************************************** ************* ); die $sUsage unless @ARGV == 2; my($num_threads, $fastq) = @ARGV; # Path for executable mira and cap3 my $MIRA_BIN = "/home/DNA/Tools/mira_3.2.1_prod_linuxgnu_x86_64_static/bin/mira "; my $CAP3 = "/home/DNA/Tools/CAP3/cap3 "; open (IN, "$fastq") or die "can't open file $fastq\n"; my $out_file = "mira_contigs.output.fasta"; open (my $out_fh, ">$out_file") or die $!; my $count_mira = 0; my @data; my $line_counter = 0; my $pre_id; while(<IN>) { next if /^\s+$/; chomp; my $line = $_; $line_counter++; #print STDERR 'scalar @data: ', scalar @data, "\n"; if($line_counter % 4 == 1) { # @DJB775P1:264:D0M7EACXX:3:1107:7206:67178-1 BL:0 PH:0 CT:tdk45_contig_51016 my $id = $1 if $line=~/(BL.*\d+)$/; $pre_id = $id unless defined $pre_id; #print STDERR $pre_id, "\t", $id, "\n"; if($id eq $pre_id) { push @data, $line; } else { run_mira($pre_id, @data); $pre_id = $id; @data=(); push @data, $line; } } else { push @data, $line; } if(eof(IN)) { run_mira($pre_id, @data); } } close IN; sub run_mira { $count_mira++; print STDERR "Runnig mira times: ", $count_mira, "\n"; my ($id, @data) = @_; my $tmp_fasta_file = "data.tmp.fasta"; my $tmp_qual_file = "data.tmp.fasta.qual"; generate_files($tmp_fasta_file, $tmp_qual_file, @data); my $cmd = $MIRA_BIN . "--project=mira_tmp --job=denovo,solexa,est -fasta=" . $tmp_fasta_file; print STDERR $cmd, "\n"; eval{ system($cmd)}; return if $@; my $mira_contig = "mira_tmp_assembly/mira_tmp_d_results/mira_tmp_out.padded.fasta"; return unless -e $mira_contig; die if system($CAP3. $mira_contig); my @cap_outputs = map{$mira_contig.$_}(".cap.contigs", ".cap.singlets"); processing_fasta_file($id, @cap_outputs); } sub generate_files { my ($fasta, $qual, @data) = @_; open(F, ">$fasta") or die; open(Q, ">$qual") or die; foreach my $ind(0..$#data) { if($ind % 4 == 0) { $data[$ind]=~s/\@/>/; print F $data[$ind], "\n"; print Q $data[$ind], "\n"; } if($ind % 4 ==1) { print F $data[$ind], "\n"; } if($ind % 4 == 3) { my @s = split //, $data[$ind]; my @q=map{ord($_)-33}@s; print Q join(" ", @q), "\n"; } } close F; close Q; } sub processing_fasta_file { my $pre_id = shift; my @files = @_; my %hash; my $count = 0; foreach my $f (@files) { open (I, $f) or die; my $id; while(<I>) { chomp; if(/>/){$count++; $id="C".$count; next} $hash{$id} .=$_; } close I; } foreach (keys %hash) { print {$out_fh} ">", $_, " ", $pre_id, "\n", $hash{$_}, "\n"; } }