This repository has been archived by the owner on Jun 12, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tQN_generate_normalized_data.pl
156 lines (119 loc) · 4.2 KB
/
tQN_generate_normalized_data.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/perl -
use strict;
use warnings;
use File::Spec;
use Getopt::Long;
use IO::File;
use Pod::Usage;
# User-definable parameters and their defaults.
#   $output_dir    - directory checked below; must already exist
#   $output_format - name of the format to generate
#   $all_samples   - path of the sample list file
my $output_dir    = '';
my $output_format = 'PennCNV';
my $all_samples   = '';

# Override the defaults from the command line; an unknown or malformed option
# prints the usage text and exits with status 1.
GetOptions(
    'output_directory=s' => \$output_dir,
    'output_format=s'    => \$output_format,
    'samples_list=s'     => \$all_samples,
) or pod2usage(1);

# The directory must already exist.  With no --output_directory given the path
# is the empty string, which also fails this check.
$output_dir = File::Spec->canonpath($output_dir);
unless (-d $output_dir) {
    print STDERR "Error: Cannot access directory for storing output: " .
        "$output_dir\n";
    # Non-zero status so that callers can detect the failure.
    exit(1);
}
# Read the sample list.  The file is tab-separated, its first line must be
# exactly "Assay\tFilename\tIGV_index", and the first column of every following
# line is taken as a sample name.  Fills @samples and $nsamp.
print "\nGenerate normalized data in the specified output format\n";
my $sample_file = File::Spec->canonpath($all_samples);

# Three-argument open with a lexical handle: the file name is never
# interpreted as an open mode.
open(my $samples_fh, '<', $sample_file)
    or die "Error: Cannot open sample list file: $sample_file\n";

my $ok_sample_header = "Assay\tFilename\tIGV_index";
my $sample_header    = <$samples_fh>;

# An empty file yields undef here; report it like any other bad header.
chomp($sample_header) if defined $sample_header;
if (!defined $sample_header || $sample_header ne $ok_sample_header) {
    print STDERR "Error: Unexpected file format in sample list file $sample_file: " .
        "The header is not $ok_sample_header\n";
    # Non-zero status so that callers can detect the failure.
    exit(1);
}

my @samples;
while (my $line = <$samples_fh>) {
    chomp($line);
    my ($assay) = split(/\t/, $line);

    # Blank lines and lines with an empty first column name no sample.
    next unless defined $assay && length $assay;
    push(@samples, $assay);
}
close($samples_fh);

# Number of samples, used for the progress display.
my $nsamp = scalar @samples;
# Convert every listed sample into the requested format, one output file per
# sample: "<output_dir>/<sample>_tQN.txt" is read and
# "<output_dir>/<sample>_tQN_<output_format>.txt" is written.  Any other
# --output_format value matches no branch and nothing is converted.
# NOTE(review): an earlier comment here said a samples_names.txt file is also
# generated for BAFsegmentation; nothing in this section writes such a file.
if (   $output_format eq 'PennCNV'
    || $output_format eq 'QuantiSNP'
    || $output_format eq 'BAFsegmentation')
{
    # Exact header line every tQN input file must start with.
    my $ok_header = "Name\tChr\tPosition\ttQN B Allele Frequency\ttQN Log R Ratio\ttQN X\ttQN Y";

    # Input columns to emit, in output order.  PennCNV and BAFsegmentation keep
    # B Allele Frequency (3) before Log R Ratio (4); QuantiSNP swaps the two.
    my @column_order = $output_format eq 'QuantiSNP' ? (0, 1, 2, 4, 3) : (0 .. 4);

    my $progress = 0;

    SAMPLE:
    foreach my $sample (@samples) {
        # Count every sample, readable or not, so the display reaches $nsamp.
        $progress++;
        print " Processing sample ".$sample." (".$progress."/".$nsamp.")\n";

        my $filename = File::Spec->canonpath("$output_dir/${sample}_tQN.txt");
        my $fh;
        if (!open($fh, '<', $filename)) {
            # A missing sample file is not fatal: warn and move on.
            print "WARNING: Cannot open sample file: $filename\n";
            next SAMPLE;
        }

        # Validate the input before creating any output, so that a bad input
        # file does not leave a header-only output file behind.
        my $header = <$fh>;
        chomp($header) if defined $header;
        if (!defined $header || $header ne $ok_header) {
            print STDERR "Error: Incorrect column headers in: $filename\n" .
                "expected: $ok_header\n";
            # Non-zero status so that callers can detect the failure.
            exit(1);
        }

        my $outfile = File::Spec->canonpath("$output_dir/${sample}_tQN_${output_format}.txt");
        open(my $out_fh, '>', $outfile)
            or die "Error: Cannot open output file: $outfile: $!\n";

        # The same header line is written for all three formats.
        # NOTE(review): it names B Allele Frequency before Log R Ratio even
        # when QuantiSNP data rows carry the two in the opposite order --
        # confirm that this is what the consumer expects.
        print {$out_fh} "Name\tChr\tPosition\t${sample}.B Allele Frequency\t" .
            "${sample}.Log R Ratio\n";

        my $skipped = 0;
        while (my $line = <$fh>) {
            chomp($line);
            my @items = split(/\t/, $line, -1);

            # Blank or truncated rows lack the value columns.
            if (@items < 5) {
                $skipped++;
                next;
            }

            # Rows with a missing B Allele Frequency or Log R Ratio are dropped.
            next if $items[3] eq "NA" || $items[4] eq "NA";

            print {$out_fh} join("\t", @items[@column_order]), "\n";
        }
        print STDERR "WARNING: Skipped $skipped malformed line(s) in: $filename\n"
            if $skipped;

        # Buffered write errors only surface when the handle is closed.
        close($out_fh)
            or die "Error: Cannot finish writing output file: $outfile: $!\n";
        close($fh);
    }
}
print "Analysis finished\n";
# End of main program
# Documentation
=pod
=head1 NAME
tQN_generate_normalized_data.pl - convert tQN-normalized sample files to PennCNV, QuantiSNP or BAFsegmentation format
=head1 SYNOPSIS
tQN_generate_normalized_data.pl --samples_list=<name> --output_directory=<name> [--output_format=<name>]
=head1 OPTIONS
=over 8
=item B<--samples_list=<name>>
Specify a tab-separated file listing the samples to convert. Its first
line must be the header "Assay", "Filename", "IGV_index" (separated by
tabs). The first column of every following line is used as the sample
name.
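=item B<--output_directory=<name>>
Specify the directory that contains the tQN-normalized sample files, named "SAMPLE_tQN.txt" for each SAMPLE in the sample list. The converted files are written to the same directory. This option is required: the script stops with an error if the directory does not exist.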
=item B<--output_format=<name>>
Specify the format of the generated normalized data. Alternatives are "PennCNV", "QuantiSNP", and "BAFsegmentation"; with any other value (including "BeadStudio") no files are converted. Default is "PennCNV".
=back
=head1 AUTHORS
Markus Ringner
Please report bugs to markus.ringner@med.lu.se
=cut
# End of documentation