#!/usr/bin/perl -w

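## Basically implements the first step of the original pipeline; a few positions still cannot be aligned completely.
## This version mainly addresses the IDs reported in two_best_blat_rel.log, for which an excessive number of fixed sites was observed.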

use strict;
use Getopt::Long;
use Pod::Usage;
use POSIX;
use POSIX ":sys_wait_h";



# ---------------------------------------------------------------------------
# Command-line options (defaults below) and one-time setup.
#
# The log files are opened only AFTER the options have been parsed and
# validated, so that "--help"/"--man" or a bad invocation does not truncate
# the logs left by a previous run.
# ---------------------------------------------------------------------------
my $man = '';
my $help= '';

# BLAT result files (PSL), one per species (--blat_in1..3).
my $blat_rel_input_file1='';
my $blat_rel_input_file2='';
my $blat_rel_input_file3='';
# Threshold compared against column1/column11 of each BLAT line when the
# hits are filtered further down (--blat_th).
my $blat_rel_threshold='0.8';

# Number of filtered BLAT lines per chunk handed to "split -l"; a value
# <= 0 skips the splitting and re-uses an existing sub_<vcf> directory
# (--split_infile).
my $split_infile='100';
# Limit on concurrently running worker processes (--cpu_th).
my $cpu_th="8";
# Name/path of the blastdbcmd executable (--blastdbcmd).
my $blastdbcmd="blastdbcmd";

# BLAST databases passed to "blastdbcmd -db", one per species.
my $db_file1='';
my $db_file2='';
my $db_file3='';

# VCF files, one per species; "T" for --split_vcf splits each of them
# into per-chromosome files named "<chrom>.<vcf_file>".
my $vcf_file1='';
my $vcf_file2='';
my $vcf_file3='';
my $split_vcf='T';

# Number of sample columns (starting at VCF column 10) used per species.
my $indi_acc1='';
my $indi_acc2='';
my $indi_acc3='';

# FASTA file with the original query sequences, keyed by the first
# whitespace-separated word of each header.
my $seq_file1='';
#my $seq_file2='';
#my $seq_file3='';

# Accepted on the command line; nothing in this part of the script acts on it.
my $version = '';

GetOptions('help|?' => \$help,
	   man => \$man,
	   'blat_in1=s'=>\$blat_rel_input_file1,
	   'blat_in2=s'=>\$blat_rel_input_file2,
	   'blat_in3=s'=>\$blat_rel_input_file3,
	   'blat_th=s'=>\$blat_rel_threshold,

	   'db_file1=s'=>\$db_file1,
	   'db_file2=s'=>\$db_file2,
	   'db_file3=s'=>\$db_file3,

	   'vcf_file1=s'=>\$vcf_file1,
	   'vcf_file2=s'=>\$vcf_file2,
	   'vcf_file3=s'=>\$vcf_file3,
	   'split_vcf=s'=>\$split_vcf,
	   'split_infile=s'=>\$split_infile,
	   'cpu_th=s'=>\$cpu_th,
	   'blastdbcmd=s'=>\$blastdbcmd,

	   'indi_acc1=s'=>\$indi_acc1,
	   'indi_acc2=s'=>\$indi_acc2,
	   'indi_acc3=s'=>\$indi_acc3,

	   'seq_file1=s'=>\$seq_file1,
#	   'seq_file2=s'=>\$seq_file2,
#	   'seq_file3=s'=>\$seq_file3,

	   'version'=> \$version
    )
    or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;

# Every one of these is interpolated into a shell command or used as a loop
# bound later on; an empty value makes the external tools read from stdin
# or yields empty output, so fail early with a readable message instead.
my %required_option=(
    'blat_in1'  => $blat_rel_input_file1,
    'blat_in2'  => $blat_rel_input_file2,
    'blat_in3'  => $blat_rel_input_file3,
    'db_file1'  => $db_file1,
    'db_file2'  => $db_file2,
    'db_file3'  => $db_file3,
    'vcf_file1' => $vcf_file1,
    'vcf_file2' => $vcf_file2,
    'vcf_file3' => $vcf_file3,
    'indi_acc1' => $indi_acc1,
    'indi_acc2' => $indi_acc2,
    'indi_acc3' => $indi_acc3,
    'seq_file1' => $seq_file1,
);
my @missing_options=grep { $required_option{$_} eq '' } sort keys %required_option;
if (@missing_options){
    die "wrong, missing required option(s): ".join(", ",map { "--$_" } @missing_options)."\n";
}

# Shared log files; the worker processes forked later inherit both handles.
open (TBBR,">two_best_blat_rel.log") or die "wrong, can not open two_best_blat_rel.log file: $!\n";
open (SNP_COLLECT,">collect_snp_cmd.log") or die "wrong, can not open collect_snp_cmd.log file: $!\n";

# Filtered BLAT results used by the later steps.
my $blat_tmp1=$blat_rel_input_file1."_for_mk_test";
my $blat_tmp2=$blat_rel_input_file2."_for_mk_test";
my $blat_tmp3=$blat_rel_input_file3."_for_mk_test";

# Original query sequences: ID => sequence.
my %tmpfile_hash1=&read_fasta_file_new($seq_file1,"space");

#goto VCF;

# Split every VCF into one file per chromosome/scaffold, named
# "<chrom>.<vcf_file>", which is the file the SNP lookup reads later on.
#
# Each VCF is read exactly once and records are assigned by an exact match
# on column 1.  (The previous "grep <chrom> <vcf>" rescanned the whole file
# for every chromosome and also copied lines of any other chromosome whose
# name merely contained <chrom>, e.g. chr1 -> chr10, chr11.)
if ($split_vcf eq "T"){
    foreach my $vcf_path ($vcf_file1,$vcf_file2,$vcf_file3){
        my $vcf_in;
        if (not open($vcf_in,'<',$vcf_path)){
            warn "wrong, can not open vcf file $vcf_path: $!\n";
            next;
        }
        my %seen_chrom;      # chromosomes whose output file was already started
        my $current_chrom;   # chromosome of the currently open output file
        my $chrom_out;
        while (my $record=<$vcf_in>){
            next if $record=~/^#/;               # header / meta lines
            my ($chrom)=split ' ',$record;       # awk-style first field
            next unless defined $chrom and length $chrom;
            if (not defined $current_chrom or $chrom ne $current_chrom){
                # Only one output handle is open at a time, so the number
                # of scaffolds is not limited by the open-file limit.  An
                # unsorted VCF re-opens earlier files in append mode.
                close $chrom_out if $chrom_out;
                my $mode=$seen_chrom{$chrom}++ ? '>>' : '>';
                open($chrom_out,$mode,"$chrom.$vcf_path")
                    or die "wrong, can not write $chrom.$vcf_path: $!\n";
                $current_chrom=$chrom;
            }
            print {$chrom_out} $record;
        }
        close $chrom_out if $chrom_out;
        close $vcf_in;
    }
}

#VCF:

#if (-e $blat_tmp1){
#    print "please move your previous result $blat_tmp1\n";
#    exit;
#}

#if (-e $blat_tmp2){
#    print "please move your previous result $blat_tmp2\n";
#    exit;
#}

#if (-e $blat_tmp3){
#    print "please move your previous result $blat_tmp3\n";
#    exit;
#}

# Filter the raw BLAT output of each species: drop the first five lines
# (the PSL header) and keep a hit only when column 11 is positive and
# column1/column11 exceeds the --blat_th threshold.
# NOTE(review): the original author remarked that this criterion may be
# too simplistic.
my $cmd;
foreach my $blat_pair ([$blat_rel_input_file1,$blat_tmp1],
		       [$blat_rel_input_file2,$blat_tmp2],
		       [$blat_rel_input_file3,$blat_tmp3]){
    my ($psl_in,$psl_out)=@{$blat_pair};
    $cmd =qq!sed '1,5d' $psl_in|awk '{if (\$11>0){ if (\$1/\$11>$blat_rel_threshold){print \$0} }}' >$psl_out!;
    # A failing filter leaves an empty/partial file behind; say so instead
    # of continuing silently.
    if (system($cmd) != 0){
	warn "wrong, filtering $psl_in failed (status $?): $cmd\n";
    }
}

#VCF:

# ===========================================================================
# Steps 1-3: for every species, cut the filtered BLAT hits into chunks,
# process the chunks in parallel worker processes and append one FASTA
# record group per query ID to <time>_fasta/<ID>.fasta.
#
# The three steps used to be three copies of the same ~250 lines; they now
# share the helpers defined at the end of this section.  Fixes compared to
# the copies:
#   * steps 2/3 sized and printed their sequences with the sample count of
#     species 1 (indi_acc1) instead of their own count;
#   * a failed fork() silently dropped the chunk - it is now retried after
#     a running worker has finished;
#   * STDOUT is unbuffered so that workers forked for a later step do not
#     re-print the buffered "Step Is Finished" message of an earlier step;
#   * at most --cpu_th workers run at a time and finished workers are
#     reaped with a blocking waitpid() instead of a sleep/poll loop;
#   * blank or truncated BLAT lines are skipped instead of producing a
#     nameless ".fasta" file, and *_cmd.log files left in a chunk directory
#     by an earlier run are no longer read as BLAT chunks.
# ===========================================================================

$| = 1;

# Output directory, named after the start time of this run.
my $time_spot=time()."_fasta";
mkdir($time_spot) or -d $time_spot
    or die "wrong, can not create directory $time_spot: $!\n";

# One entry per species.  Only the first step writes the verbose
# "<chunk>_cmd.log" trace, exactly as before.
my @species_steps=(
    { label=>"First",  blat=>$blat_tmp1, vcf=>$vcf_file1, db=>$db_file1, indi=>$indi_acc1, trace=>1 },
    { label=>"second", blat=>$blat_tmp2, vcf=>$vcf_file2, db=>$db_file2, indi=>$indi_acc2, trace=>0 },
    { label=>"Third",  blat=>$blat_tmp3, vcf=>$vcf_file3, db=>$db_file3, indi=>$indi_acc3, trace=>0 },
);

foreach my $step_no (0 .. $#species_steps){
    my $step=$species_steps[$step_no];
    _run_species_step($step);
    if ($step_no == $#species_steps){
        close TBBR;
        close SNP_COLLECT;
    }
    print "The ".$step->{label}." Step Is Finished!\n";
}

# Run one species: split its filtered BLAT file into chunks (unless
# --split_infile <= 0, in which case an existing sub_<vcf> directory is
# re-used) and process every chunk in its own worker process.
sub _run_species_step {
    my ($step)=@_;
    my $chunk_dir="sub_".$step->{vcf};

    if ($split_infile>0){
        system("mkdir ".$chunk_dir);
        system("split -l $split_infile ".$step->{blat}." -d -a 4 ".$step->{vcf}."_");
        system("mv ".$step->{vcf}."_* ".$chunk_dir);
    }

    my @chunks;
    if (opendir(my $dir_handle,$chunk_dir)){
        # Skip hidden entries and trace logs written by a previous run.
        @chunks=sort grep { !/^\./ and !/_cmd\.log$/ } readdir($dir_handle);
        closedir($dir_handle);
    }else{
        warn "wrong, can not list directory $chunk_dir: $!\n";
    }

    my $max_workers=($cpu_th > 0) ? int($cpu_th) : 1;
    my %running;   # pid => 1 for every worker not yet reaped

    foreach my $chunk (@chunks){
        _reap_worker(\%running) while keys(%running) >= $max_workers;

        my $pid=fork();
        until (defined $pid){
            # Out of processes/memory: wait for one of our own workers to
            # finish and try again; give up only if nothing is running.
            die "resources not avilable, fork failed: $!\n" unless %running;
            warn "fork failed ($!), retrying after a worker has finished\n";
            _reap_worker(\%running);
            $pid=fork();
        }

        if ($pid==0){
            _process_blat_chunk("$chunk_dir/$chunk",$step);
            exit;
        }
        $running{$pid}=1;
    }

    _reap_worker(\%running) while %running;
    return;
}

# Block until one worker has exited and remove it from the bookkeeping.
sub _reap_worker {
    my ($running)=@_;
    my $finished=waitpid(-1,0);
    if ($finished <= 0){
        # No child left to wait for; do not loop forever on stale entries.
        %{$running}=();
        return;
    }
    warn "worker $finished exited with status ".($? >> 8)."\n" if $? != 0;
    delete $running->{$finished};
    return;
}

# Write to the optional trace log; a no-op when tracing is switched off.
sub _trace {
    my ($trace_fh,@text)=@_;
    print {$trace_fh} @text if $trace_fh;
    return;
}

# Return a run of $count "X" placeholders ("" for undef or counts <= 0).
sub _x_run {
    my ($count)=@_;
    return "" unless defined $count and $count > 0;
    return "X" x $count;
}

# Worker body: read one chunk of BLAT (PSL) lines, collect the target
# sequence and the SNPs of every aligned block, then assemble and write the
# per-individual sequences of every query ID found in the chunk.
#
# PSL columns used (0-based): 8 strand, 9 query name, 13 target name,
# 18 blockSizes, 19 qStarts, 20 tStarts.
sub _process_blat_chunk {
    my ($chunk_path,$step)=@_;
    my ($db_file,$vcf_file,$indi_acc)=@{$step}{qw(db vcf indi)};

    my $trace_fh;
    if ($step->{trace}){
        open($trace_fh,'>',$chunk_path."_cmd.log")
            or die "wrong, can not open $chunk_path"."_cmd.log file\n";
    }
    open(my $psl_fh,'<',$chunk_path) or die "wrong, can not open file $chunk_path\n";

    # Query ID => { strand, all (index of last block), qstart{i}, seq{i},
    #               cnt{i}{snp_no}, snp{i}{snp_no}{individual} }
    # NOTE(review): duplicates are only detected within one chunk; the same
    # ID in two chunks is appended twice to its FASTA file.
    my %alignment_of;

    while (my $psl_line=<$psl_fh>){
        chomp $psl_line;
        my @psl=split /\s+/,$psl_line;
        next if @psl < 21;   # blank or truncated line
        my ($strand,$id,$chro)=@psl[8,9,13];

        if (exists $alignment_of{$id}){
            print TBBR $id," this IDs have presented twice, this time is ignored! $psl_line\n";
            next;
        }

        my @blocksizes=split /,/,$psl[18];
        my @qstart=split /,/,$psl[19];
        my @tstart=split /,/,$psl[20];

        my %rec=(
            all    => $#tstart,
            strand => ($strand eq "-" ? "minus" : "plus"),
        );

        foreach my $i (0 .. $#tstart){
            my $start=$tstart[$i]+1;                  # first base of the block (1-based)
            my $stop=$tstart[$i]+$blocksizes[$i];     # last base of the block
            $rec{qstart}{$i}=$qstart[$i];

            # Reference sequence of the block.
            my $fetch_cmd="$blastdbcmd -db $db_file -entry $chro -range $start-$stop";
            print SNP_COLLECT "$id\t$fetch_cmd\n";
            _trace($trace_fh,"$id\t$fetch_cmd\n");
            my $fasta=`$fetch_cmd`;
            $fasta="" unless defined $fasta;
            $rec{seq}{$i}=join("",grep { !/>/ } split(/\n/,$fasta));
            warn "no sequence returned for $id by: $fetch_cmd\n" if $rec{seq}{$i} eq "";

            # SNPs falling into the block, from the per-chromosome VCF.
            # This lookup spawns one awk per block and dominates run time.
            my $snp_cmd=qq[awk '{if (\$1 == "$chro" && \$2>=$start && \$2<=$stop){print \$0} }' $chro.$vcf_file];
            print SNP_COLLECT "$id\t$snp_cmd\n";
            _trace($trace_fh,"$id\t$snp_cmd\n");
            my $vcf_hits=`$snp_cmd`;
            $vcf_hits="" unless defined $vcf_hits;

            my $snp_no=0;
            foreach my $vcf_line (split /\n/,$vcf_hits){
                $snp_no++;
                my @snp_fields=split /\s+/,$vcf_line;
                # 0-based offset of the SNP inside the block sequence.
                $rec{cnt}{$i}{$snp_no}=$snp_fields[1]-$start;
                foreach my $j (0 .. $indi_acc-1){
                    $rec{snp}{$i}{$snp_no}{$j}=&given_snp($snp_fields[9+$j],@snp_fields);
                }
            }
        }
        $alignment_of{$id}=\%rec;
    }
    close $psl_fh;

    foreach my $key (sort keys %alignment_of){
        my $rec=$alignment_of{$key};
        my @all_seq=($rec->{strand} eq "plus")
            ? _assemble_plus($key,$rec,$indi_acc,$trace_fh)
            : _assemble_minus($key,$rec,$indi_acc,$trace_fh);
        _write_alignment($key,\@all_seq,$indi_acc);
    }

    close $trace_fh if $trace_fh;
    return;
}

# Return one copy of block $i per individual with that individual's SNP
# codes substituted at the recorded offsets.
#
# For the plus strand ($is_plus true) every substitution is traced and an
# empty/undefined code aborts the worker, as the original code did.
# NOTE(review): the minus strand never had that check; an undefined code
# there removes one base from the block.  Left unchanged - confirm intent.
sub _apply_block_snps {
    my ($key,$rec,$i,$indi_acc,$trace_fh,$is_plus)=@_;
    my @segment=map { $rec->{seq}{$i} } 1 .. $indi_acc;
    my $snps_of_block=$rec->{snp}{$i} || {};

    foreach my $snp_no (sort keys %{$snps_of_block}){
        my $offset=$rec->{cnt}{$i}{$snp_no};
        foreach my $individual (sort keys %{$snps_of_block->{$snp_no}}){
            my $before=$segment[$individual];
            my $head=substr($before,0,$offset);
            my $code=$snps_of_block->{$snp_no}{$individual};
            if ($is_plus){
                _trace($trace_fh,$key."\t".$head."=substr(".$before.",0,".$offset.");\n");
                if (not $code){
                    print "$key,$i,$snp_no,$individual,AAAAAAAAA \n";
                    exit;
                }
            }
            my $tail=substr($before,$offset+1);
            $segment[$individual]=$head.$code.$tail;
            if ($is_plus){
                _trace($trace_fh,$key."\t".$tail."=substr(".$before.",".$offset."+1);\n");
                _trace($trace_fh,$key."\t".$segment[$individual]."=".$head.".".$code.".".$tail.";\n");
            }
        }
    }
    return @segment;
}

# Plus strand: walk the blocks in order; every block is left-padded with
# qStart "X" placeholders, the part already covered by the sequence built
# so far is cut off, and the remainder is appended.
sub _assemble_plus {
    my ($key,$rec,$indi_acc,$trace_fh)=@_;
    my @all_seq=map { "" } 1 .. $indi_acc;

    foreach my $i (0 .. $rec->{all}){
        next unless exists $rec->{seq}{$i};
        my @segment=_apply_block_snps($key,$rec,$i,$indi_acc,$trace_fh,1);
        my $pad=_x_run($rec->{qstart}{$i});

        foreach my $k (0 .. $indi_acc-1){
            $segment[$k]=$pad.$segment[$k];
            _trace($trace_fh,$key."\t".$segment[$k]."=".$pad.".".$segment[$k].";\n");
            my $built_length=length($all_seq[$k]);
            $segment[$k]=substr($segment[$k],$built_length);
            _trace($trace_fh,$key."\t".$segment[$k]."=substr(".$segment[$k].",".$built_length.");\n");
            $all_seq[$k].=$segment[$k];
            _trace($trace_fh,$key."\t".$all_seq[$k].".=".$segment[$k].";\n");
        }
    }
    return @all_seq;
}

# Minus strand: walk the blocks from last to first; every block is
# reverse-complemented (IUPAC aware), right-padded with qStart "X"
# placeholders and spliced onto the sequence built so far, which is first
# cut to the length implied by the original query length.
sub _assemble_minus {
    my ($key,$rec,$indi_acc,$trace_fh)=@_;
    my @all_seq=map { "" } 1 .. $indi_acc;
    my $original=$tmpfile_hash1{$key};
    my $original_length=defined $original ? length($original) : 0;

    foreach my $i (reverse 0 .. $rec->{all}){
        next unless exists $rec->{seq}{$i};
        my @segment=_apply_block_snps($key,$rec,$i,$indi_acc,$trace_fh,0);
        my $pad=_x_run($rec->{qstart}{$i});

        foreach my $k (0 .. $indi_acc-1){
            _trace($trace_fh,"$key\t$segment[$k]\n");
            $segment[$k]=scalar reverse $segment[$k];
            _trace($trace_fh,"$key\t$segment[$k]\n");
            $segment[$k]=~ tr/ACGTacgt/TGCAtgca/;
            _trace($trace_fh,"$key\t$segment[$k]\n");
            $segment[$k]=~ tr/MYKRmykr/KRMYkrmy/;
            _trace($trace_fh,"$key\t$segment[$k]\n");
            $segment[$k]=~ tr/HVDBhvdb/DBHVdbhv/;
            _trace($trace_fh,"$key\t$segment[$k]\n");

            $segment[$k]=$segment[$k].$pad;
            _trace($trace_fh,$key."\t".$segment[$k]."=".$segment[$k].".".$pad.";\n");
            my $segment_length=length($segment[$k]);

            if ($all_seq[$k] ne ""){
                # qStart of the block handled in the previous iteration.
                my $next_qstart=$rec->{qstart}{$i+1};
                my $keep_length=($segment_length > $next_qstart)
                    ? $original_length-$next_qstart
                    : $original_length-$segment_length;
                $all_seq[$k]=substr($all_seq[$k],0,$keep_length).$segment[$k];
                _trace($trace_fh,$key."\t".$all_seq[$k]."=substr(".$all_seq[$k].",0,".$keep_length.").".$segment[$k].";\n");
            }else{
                my $lead=_x_run($original_length-$segment_length);
                $all_seq[$k]=$lead.$segment[$k];
                _trace($trace_fh,$key."\t".$all_seq[$k]."=".$lead.".".$segment[$k].";\n");
            }
        }
    }
    return @all_seq;
}

# Append one record group to <time>_fasta/<ID>.fasta: a ">ID" header, one
# line per individual (padded with "X" when the first individual's
# sequence is shorter than the original query) and the original query.
sub _write_alignment {
    my ($key,$all_seq,$indi_acc)=@_;
    open(my $seq_out,'>>',"$time_spot/$key.fasta")
        or die "wrong, can not append to $time_spot/$key.fasta: $!\n";
    print {$seq_out} ">",$key,"\n";

    my $original=$tmpfile_hash1{$key};
    $original="" unless defined $original;
    my $first_length=@{$all_seq} ? length($all_seq->[0]) : 0;
    # The padding of every row is derived from the first individual.
    my $tail_pad=_x_run(length($original)-$first_length);

    foreach my $k (0 .. $indi_acc-1){
        print {$seq_out} $all_seq->[$k],$tail_pad,"\n";
    }
    print {$seq_out} $original,"\n";
    close $seq_out;
    return;
}

#### step 3 ########################################################################################

















### sub start ##################################################################################

# Translate one sample's genotype at a VCF record into the character that
# replaces the reference base.
#
#   $snp          the sample column ("GT:..." - only the part before the
#                 first ':' is used)
#   @my_snp_line  all whitespace-separated columns of the VCF record
#                 (index 3 = REF, 4 = ALT, 7 = INFO, 8 = FORMAT)
#
# Returns "I" when INFO or FORMAT mentions INDEL, "N" for a missing call,
# the allele itself for a homozygous call, and otherwise the IUPAC code
# that DNA_oligo() gives for the distinct called alleles (undef when
# DNA_oligo() knows no code).  Prints a message and exits when the genotype
# refers to an allele that the record does not have.
#
# Phased ("0|1") and unordered ("1/0") genotypes, haploid calls and any
# number of ALT alleles are understood; previously "1|1" and "0|0" were
# reported as REF/ALT heterozygotes and a third ALT allele was rejected.
#
# NOTE(review): records with multi-base REF/ALT that are not tagged INDEL
# are still substituted verbatim - confirm that the VCFs always carry the
# INDEL tag.
sub given_snp {
    my ($snp,@my_snp_line)=@_;

    if ($my_snp_line[8]=~/INDEL/ or $my_snp_line[7]=~/INDEL/){
	return "I";
    }

    my $genotype;
    ($genotype)=split /:/,$snp if defined $snp;
    return "N" unless defined $genotype and length $genotype;

    my @allele_index=split m{[/|]},$genotype;
    return "N" if not @allele_index or grep { $_ eq "." } @allele_index;

    # Allele 0 is REF, alleles 1..n are the comma-separated ALT entries.
    my @alleles=($my_snp_line[3],split(/,/,$my_snp_line[4]));

    my %is_called;
    foreach my $index (@allele_index){
	if ($index!~/^\d+$/ or $index > $#alleles){
	    print "type is not belong to GT, something is wrong here!\n";
	    exit;
	}
	$is_called{$alleles[$index]}=1;
    }

    my @called=sort keys %is_called;
    return $called[0] if @called == 1;   # homozygous
    return &DNA_oligo(@called);          # heterozygous -> ambiguity code
}


# Return the IUPAC ambiguity code for a set of nucleotides.
#
# Arguments: two or three bases in any order and any case, e.g.
# ("A","C") -> "M", ("T","C","A") -> "H".  Repeated bases are counted once,
# so ("A","A") gives "A".  Returns undef for anything else, including all
# four bases (a code for that case was deliberately left disabled).
sub DNA_oligo {
    my @dna=@_;

    my %code_for=(
        "AC"  => "M",
        "CT"  => "Y",
        "GT"  => "K",
        "CG"  => "S",
        "AG"  => "R",
        "AT"  => "W",
        "ACT" => "H",
        "ACG" => "V",
        "AGT" => "D",
        "CGT" => "B",
    );

    my %seen;
    my @bases=grep { !$seen{$_}++ } map { uc } grep { defined } @dna;
    my $dna=join("",sort @bases);

    return $dna if $dna=~/^[ACGT]$/;   # only one distinct base was given
    return $code_for{$dna};
}


# Expand an IUPAC ambiguity code into the nucleotides it stands for.
# Returns the bases as a list (in the fixed order given in the table) or an
# empty list for any other input; "N" is deliberately not expanded.
sub oligo_DNA(){
    my ($code)=@_;

    my %bases_for=(
        "M" => ["A","C"],
        "W" => ["A","T"],
        "Y" => ["T","C"],
        "K" => ["G","T"],
        "S" => ["G","C"],
        "R" => ["A","G"],
        "H" => ["A","T","C"],
        "V" => ["A","G","C"],
        "D" => ["A","T","G"],
        "B" => ["G","T","C"],
#       "N" => ["A","G","T","C"],
    );

    my @bases;
    if (exists $bases_for{$code}){
        @bases=@{$bases_for{$code}};
    }
    return @bases;
}


# Return the standard genetic code as a hash: DNA codon (upper case) =>
# one-letter amino acid, with "*" for the three stop codons.
sub codon_table
{
    # Amino acid followed by all codons that encode it.
    my @codons_of=(
        ["F", qw(TTT TTC)],
        ["L", qw(TTA TTG CTA CTG CTT CTC)],
        ["I", qw(ATA ATC ATT)],
        ["M", qw(ATG)],
        ["V", qw(GTA GTC GTT GTG)],
        ["S", qw(TCA TCT TCG TCC AGT AGC)],
        ["P", qw(CCT CCA CCC CCG)],
        ["T", qw(ACA ACT ACG ACC)],
        ["A", qw(GCT GCA GCC GCG)],
        ["Y", qw(TAT TAC)],
        ["*", qw(TAA TAG TGA)],
        ["H", qw(CAT CAC)],
        ["Q", qw(CAA CAG)],
        ["N", qw(AAT AAC)],
        ["K", qw(AAG AAA)],
        ["D", qw(GAT GAC)],
        ["E", qw(GAG GAA)],
        ["W", qw(TGG)],
        ["C", qw(TGC TGT)],
        ["R", qw(CGT CGC CGG CGA AGA AGG)],
        ["G", qw(GGG GGC GGA GGT)],
    );

    my %codon;
    foreach my $group (@codons_of){
        my ($amino_acid,@triplets)=@{$group};
        foreach my $triplet (@triplets){
            $codon{$triplet}=$amino_acid;
        }
    }
    return %codon;
}

# Read a FASTA file into a hash: record ID => sequence.
#
# Both arguments are required:
#   $file_name  path of the FASTA file
#   $sep        how the ID is taken from a header line (after removing ">"):
#                 "dot"    text before the first "."
#                 "shu"    text before the first "|"
#                 "space"  text before the first whitespace character
#                 "join"   all whitespace-separated words joined with "-"
#                 other    the whole header
#
# The sequence lines of a record are concatenated and the first "*" in the
# result is removed.  The first line of the file is always taken as a
# header; later lines are headers when they contain ">".  Line ends may be
# LF or CRLF.  An empty file yields an empty hash.  Dies when the file
# cannot be opened.
sub read_fasta_file_new
{
    my ($file_name,$sep)=@_;
    my %hash;

    # Header text (without ">") -> record ID according to $sep.
    my $id_of_header=sub {
	my ($header)=@_;
	my @parts;
	if ($sep eq "dot"){
	    @parts=split/\./,$header;
	}elsif($sep eq "shu"){
	    @parts=split/\|/,$header;
	}elsif($sep eq "space"){
	    @parts=split/\s/,$header;
	}elsif($sep eq "join"){
	    @parts=split/\s+|\t/,$header;
	    $parts[0]=join("-",@parts);
	}else{
	    @parts=($header);
	}
	return $parts[0];
    };

    open (my $fasta_fh,'<',$file_name)
	or die "wrong, can not open fasta file '$file_name': $!\n";

    my $id;
    my $aa="";
    my $in_record=0;
    while (my $line=<$fasta_fh>){
	$line=~s/[\r\n]+\z//;
	my $is_header=($line=~s/>//);
	if ($is_header or not $in_record){
	    if ($in_record){
		# Store the record that just ended.
		$aa=~s/\*//;
		$hash{$id}=$aa;
	    }
	    $id=$id_of_header->($line);
	    $aa="";
	    $in_record=1;
	}else{
	    $aa.=$line;
	}
    }
    close $fasta_fh;

    if ($in_record){
	$aa=~s/\*//;
	$hash{$id}=$aa;
    }
    return %hash;
}
