脚本2:
perl /mnt/sdb/share/bin/deal_fa.pl
数据路径:/mnt/sdb/share/bin/linux-demo/program/fasta
要求:
【以下回答由 GPT 生成】
对于上述任务,你可以使用Perl编写一个脚本来完成。下面是一个可以实现这些功能的Perl脚本的示例代码:
```perl
use strict; use warnings;
sub filter_sequences { my ($input_file, $output_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; my $sequence = ""; my $header = ""; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { if ($sequence ne "" && length($sequence) >= 1000) { print $out_fh "$header\n$sequence\n"; } $header = $line; $sequence = ""; } else { $sequence .= $line; } } if ($sequence ne "" && length($sequence) >= 1000) { print $out_fh "$header\n$sequence\n"; } close($in_fh); close($out_fh); }
sub extract_sequences { my ($input_file, $region_file, $output_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $region_fh, "<", $region_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; my %regions; while (my $region_line = <$region_fh>) { chomp($region_line); my ($region_start, $region_end) = split("\t", $region_line); $regions{$region_start} = $region_end; } my $current_seq = ""; my $current_header = ""; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { if ($current_seq ne "" && exists $regions{$current_header}) { my $end_position = $regions{$current_header} > length($current_seq) ? length($current_seq) : $regions{$current_header}; print $out_fh "$current_header\n", substr($current_seq, 0, $end_position), "\n"; } $current_header = $line; $current_seq = ""; } else { $current_seq .= $line; } } if ($current_seq ne "" && exists $regions{$current_header}) { my $end_position = $regions{$current_header} > length($current_seq) ? length($current_seq) : $regions{$current_header}; print $out_fh "$current_header\n", substr($current_seq, 0, $end_position), "\n"; } close($in_fh); close($region_fh); close($out_fh); }
sub calculate_lengths { my ($input_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; my %lengths; my $current_header = ""; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { $current_header = $line; $lengths{$current_header} = 0; } else { $lengths{$current_header} += length($line); } } close($in_fh); return \%lengths; }
sub calculate_gc_content { my ($input_file, $window_size) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; my %gc_content; my $current_header = ""; my $current_seq = ""; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { if ($current_header ne "") { for (my $i = 0; $i < length($current_seq); $i += $window_size) { my $window_sequence = substr($current_seq, $i, $window_size); my $gc_count = ($window_sequence =~ tr/GC//); my $window_start = $i + 1; my $window_end = $i + $window_size; push(@{$gc_content{$current_header}}, "$window_start-$window_end: " . ($gc_count / $window_size) * 100 . "%"); } } $current_header = $line; $current_seq = ""; } else { $current_seq .= $line; } } if ($current_header ne "") { for (my $i = 0; $i < length($current_seq); $i += $window_size) { my $window_sequence = substr($current_seq, $i, $window_size); my $gc_count = ($window_sequence =~ tr/GC//); my $window_start = $i + 1; my $window_end = $i + $window_size; push(@{$gc_content{$current_header}}, "$window_start-$window_end: " . ($gc_count / $window_size) * 100 . "%"); } } close($in_fh); return \%gc_content; }
sub calculate_basic_stats { my ($input_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; my @sequence_lengths; my $total_length = 0; my $current_length = 0; my $current_header = ""; my $max_length = 0; my $min_length; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { if ($current_length > 0) { push(@sequence_lengths, $current_length); $total_length += $current_length; if ($current_length > $max_length) { $max_length = $current_length; } if (!defined $min_length || $current_length < $min_length) { $min_length = $current_length; } } $current_header = $line; $current_length = 0; } else { $current_length += length($line); } } if ($current_length > 0) { push(@sequence_lengths, $current_length); $total_length += $current_length; if ($current_length > $max_length) { $max_length = $current_length; } if (!defined $min_length || $current_length < $min_length) { $min_length = $current_length; } } close($in_fh);
@sequence_lengths = sort { $b <=> $a } @sequence_lengths;
my $n50 = calculate_n(50, \@sequence_lengths, $total_length);
my $n90 = calculate_n(90, \@sequence_lengths, $total_length);
return ($max_length, $min_length, $n50, $n90);
}
sub calculate_n { my ($percentage, $lengths, $total_length) = @_; my $threshold = $total_length * ($percentage / 100); my $current_length = 0; foreach my $length (@{$lengths}) { $current_length += $length; if ($current_length >= $threshold) { return $length; } } }
sub join_sequences { my ($input_file, $output_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; my $current_seq = ""; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { if ($current_seq ne "") { print $out_fh join("", ("N") x 300), "\n$current_seq\n"; } $current_seq = ""; } else { $current_seq .= $line; } } if ($current_seq ne "") { print $out_fh join("", ("N") x 300), "\n$current_seq\n"; } close($in_fh); close($out_fh); }
sub split_sequences { my ($input_file) = @; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; my $output_count = 1; my $output_file = "contig$output_count.fasta"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; while (my $line = <$in_fh>) { if ($line =~ /^>/) { close($out_fh); $output_count++; $output_file = "contig_$output_count.fasta"; open($out_fh, ">", $output_file) or die "Cannot open file: $!"; } print $out_fh $line; } close($in_fh); close($out_fh); }
sub split_files { my ($input_file) = @; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; my $output_count = 1; my $current_header = ""; my $output_file = "output$output_count.fasta"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; while (my $line = <$in_fh>) { if ($line =~ /^>/) { if ($current_header ne "") { close($out_fh); $output_count++; $output_file = "output_$output_count.fasta"; open($out_fh, ">", $output_file) or die "Cannot open file: $!"; } $current_header = $line; } print $out_fh $line; } close($in_fh); close($out_fh); }
sub reverse_complement { my ($input_file, $output_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { print $out_fh "$line\n"; } else { $line = reverse($line); $line =~ tr/ACGTacgt/TGCAtgca/; print $out_fh "$line\n"; } } close($in_fh); close($out_fh); }
sub convert_sequences { my ($input_file, $output_file) = @; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { $line =~ s//-/g; } print $out_fh "$line\n"; } close($in_fh); close($out_fh); }
sub convert_to_fastq { my ($input_file, $output_file) = @_; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; my $sequence = ""; my $header = ""; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { if ($sequence ne "") { $header =~ s/^>//; print $out_fh '@', $header, "\n", $sequence, "\n+\n", ("I" x length($sequence)), "\n"; } $header = $line; $sequence = ""; } else { $sequence .= $line; } } if ($sequence ne "") { $header =~ s/^>//; print $out_fh '@', $header, "\n", $sequence, "\n+\n", ("I" x length($sequence)), "\n"; } close($in_fh); close($out_fh); }
sub modify_prefix { my ($input_file, $output_file) = @; open(my $in_fh, "<", $input_file) or die "Cannot open file: $!"; open(my $out_fh, ">", $output_file) or die "Cannot open file: $!"; while (my $line = <$in_fh>) { chomp($line); if ($line =~ /^>/) { $line =~ s/^>.+?/>part_/; } print $out_fh "$line\n"; } close($in_fh); close($out_fh); }
filter_sequences("input.fasta", "filtered.fasta");
extract_sequences("input.fasta", "region.txt", "extracted.fasta");
my $lengths = calculate_lengths("input.fasta"); foreach my $header (keys %{ $lengths }) { print "Length of $header: ", $lengths->{$header}, "bp\n"; }
my $gc_content = calculate_gc_content("input.fasta", 100); foreach my $header (keys %{ $gc_content }) { print "GC Content of $header:\n"; foreach my $window (@{ $gc_content->{$header} }) { print "$window\n"; } }
my ($max_length, $min_length, $n50, $n90) = calculate_basic_stats("input.fasta"); print "Max Length: $max_length\n"; print "Min Length: $min_length\n"; print "N50: $n50\n"; print "N90: $n90\n";
join_sequences("input.fasta", "joined.fasta");
split_sequences("input.fasta");
split_files("input.fasta");
reverse_complement("input.fasta", "reverse_complemented.fasta");
convert_sequences("input.fasta", "converted.fasta");
convert_to_fastq("input.fasta", "output.fastq");