#!/usr/local/bin/perl use Statistics::Basic::Mean; use Statistics::Basic::StdDev; use IO::File; use Tk; # input files my $cel_dir; my $probeset_gene_file; my $expfile; my $chipidfile; my $parental_A_chips_file; my $parental_B_chips_file; # program parameters my $spread = 2.0; my $distortion = 80; my $unassigned = 10; my $output; my $use_cd = "NO"; # create main window my $mw = MainWindow->new(-background=>'black'); $mw->geometry('600x600'); $mw->resizable(0,0); # create frame for menubar my $menu_frame = $mw->Frame(-background=>'black'); # give column 0 all unused space $menu_frame->gridColumnconfigure(1, -weight => 1); # create menu bar buttons my $mbf = $menu_frame->Menubutton(-text => 'File', -relief => 'raised', -background=>'white', -foreground=>'black'); my $mbp = $menu_frame->Menubutton(-text => 'Help', -relief => 'raised', -background=>'white', -foreground=>'black'); # create commands for file button $mbf->command( -label => 'Select Folder Background Corrected .CEL files', -command => [\&dirOpen]); $mbf->command( -label => 'Select Conversion File', -command => [\&fileOpen, "conversion"]); $mbf->command( -label => 'Select Experimental Setup File', -command => [\&fileOpen, "setup"]); $mbf->command( -label => 'Select Chip List File', -command => [\&fileOpen, "chiplist"]); $mbf->command( -label => 'Select Parent A Chip List File', -command => [\&fileOpen, "parent_A"]); $mbf->command( -label => 'Select Parent B Chip List File', -command => [\&fileOpen, "parent_B"]); $mbf->separator(); $mbf->command(-label => 'Exit', -command => sub {exit;} ); # create text for help menu my $help_bgcorrection = qq(SFPscan Help - Background correction .CEL files\n\n\n The .CEL files need to be background corrected seperately under the R programming environment, with the Bioconductor package installed. Use the following R-script: copy and paste this code, all lines between START R-SCRIPT and END R-SCRIPT, in a text file and then load it into the R-programming environment. Alternatively, you can download this R-script at: http://elp.ucdavis.edu/data/analysis/sfp_map/SFPscan.html The obtained background corrected .CEL files should have a format similar to: "471.CEL" "244901_at1",363.582352941176 "244901_at2",124.111764705882 ...etc These background corrected .CEL files should all be in one folder, which then has to be selected from the 'File' menu. ############################### START R-SCRIPT ########################### ########################################################################## # # Author: Hans van Leeuwen # Department of Plant Sciences # University of California, Davis # Mail Stop 3 # One Shields Ave # Davis, CA 95616 # # Email: hvanleeuwen\@ucdavis.edu # # Date: JUNE 2005 # ########################################################################## ########################################################################## ######################### IMPORTANT ###################################### ########################################################################## ########### THE LINES MARKED WITH 'CHANGE' MUST ######################### ########### BE MODIFIED USING YOUR PATH ######################### ########################################################################## ######################### IMPORTANT ###################################### ########################################################################## ########################################################################## # Set work directory to the directory where your original .CEL # files are located setwd("C:/rw1081/hans_temp") ######### CHANGE ######## ########################################################################## # Read in file with chip ID's for which the CEL files will be read later # This file should have format: # R5.CEL # R85.CEL # R221.CEL # R277.CEL # R30.CEL # R82.CEL # R270.CEL # # etc... # ##### This file should have the filename 'chips.txt' ##################### chips <- read.table("chips.txt") ########################################################################## # Load affy package library(affy) ########################################################################## # Open cel files one by one for (i in 1:length(chips$V1)){ # Set work directory to the directory where your .CEL files are located setwd("C:/rw1081/hans_temp") ######### CHANGE ######## data <- ReadAffy(filenames = as.character(chips$V1[i])) data.bg <- bg.correct.rma(data) data.bg.matrix <- pm(data.bg) # Set work directory to the directory where the background corrected files # will be saved. This can not be the same work directory that contains the # original .CEL files!! setwd("C:/rw1081/hans_temp/bg_files") ######### CHANGE ######## write.table(data.bg.matrix, file = as.character(chips$V1[i]), append = FALSE, quote = TRUE, sep = ",", eol = "\\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE, qmethod = c("escape", "double")) } # end for ################################# END R-SCRIPT ############################# ); my $help_inputfiles = qq(SFPscan Help - Input files\n\n\n Note: examples of all input files can be downloaded at: http://elp.ucdavis.edu/data/analysis/sfp_map/sfp_map.html [CONVERSION FILE] The conversion file with the probeset and gene code data, having the format: Probe Set ID Sequence Derived From 245016_at accD 261585_at At1g01010 261568_at At1g01030 261584_at At1g01040 ...etc This file is provided with this script: "ATH1_conv-probeset-genecode.txt" Note: Positions in the Col-0 genome of the probeset sequences from the Affymetrix ATH1 GeneChip were obtained from the Arabidopsis annotation Version 4, TIGR release May 2003. [EXPERIMENTAL SETUP FILE] A file with the experimental setup, first column is RIL code, next columns, rep chip codes. First line should be a header line starting with ; (no space between ';' and 'RILcode'). For example: ;RILcode SW1 SW2 4 R5 R620 5 R85 R687 8 R221 R495b 13 R30 R666 [CHIP LIST FILE] A file with a list of chip files that will be used in this study. this file should contain both the RIL chip file names and the parental chip file names. File should have following format: R5.CEL R85.CEL R221.CEL R30.CEL R82.CEL R270.CEL R76.CEL etc. [PARENT A CHIP LIST FILE] [PARENT B CHIP LIST FILE] Two files with the chip file names of the parental chips. File should have following format: R5.CEL R85.CEL R221.CEL R30.CEL etc. ); my $help_options = qq(SFPscan Help - Program options\n\n\n [GAP SIZE] The minimum distance between the two peaks in the bimodal distribution. This value is calculated as follows: min_SFPdev SFP parent / max_SFPdev nonSFP parent. The default value is 2.0 A value greater than 1.0 means that the distributions are seperated. This value should be chosen carefully; a too low value may increase the number of markers, but can also increase the number of unassigned alleles; a too high value may give higher quality markers, but the number of markers obtained decreases. [ALLELE DISTORTION] In a RIL population a allele segregation of 50%-50% is expected, but this can vary in certain populations. The default value is 80. This means that allele segregations from 80%-20% to 20%-80% are allowed. [UNASSIGNED SCORES] This sets a limit to the number of unassigned allele scores a marker may have. The default value is 10. This means that a marker is discarded if more than 10% of the allele calls are unassigned. [OUTPUT] This is a short text (please use numbers, letters and underscores only) that will be used in the output file names. [USE CD] You have the option to use 'C' and 'D' scores in genotyping (C = NOT A, D = NOT B). If you select this option the following genotyping rules will be used: If half or more than half of the rep chips is genotyped as one parental accession, and the other rep chips as unassigned: use C or D. Type 'YES' to select this option. If you do not select this option the above mentioned chips will be scored as '-'. Type 'NO' if you do not want the 'C' and 'D' scoring to be used. ); my $help_genotyping = qq(SFPscan Help - Genotyping Rules\n\n\n The following rules are used when determining a genotype for a RIL, based on the genotypes of the individual replicate chips: A = parent A, B = parent B, C = NOT A, D = NOT B, - = unassigned If all rep chips are genotyped as one accession: genotype as 'A' or 'B' If half of the rep chips is genotyped for one accession, and the other half of the rep chips for the other accession: genotype as '-' (missing data) If half or more than half of the rep chips is genotyped as one accession, and the other rep chips as unassigned: genotype as 'C' or 'D' Note: The C and D scoring method can be switched off (see Program Options). When switched of the above cases will be genotyped as '-' (missing data). Else, genotype as '-' (missing data) ); my $help_about = "SFPscan Help - About SFPscan\n\n\nSFPscan Version 10\n\nAuthor:\n\tHans van Leeuwen\n\tDept. of Plant Sciences\n\tUniversity of California, Davis\n\tMail Stop 3\n\tOne Shields Ave\n\tDavis, CA 95616\n\tE-mail: hvanleeuwen\@ucdavis.edu, hans_van_leeuwen\@hotmail.com\n\nDate: September 2005\n\nSFPscan Web site:\n\thttp://elp.ucdavis.edu/data/analysis/sfp_map/SFPscan.html"; # create commands $mbp->command( -label => 'Background correction .CEL files', -command => [\&makeTop, "SFPscan Help - Background correction .CEL files", $help_bgcorrection]); $mbp->command( -label => 'Input files', -command => [\&makeTop, "SFPscan Help - Input files", $help_inputfiles]); $mbp->command( -label => 'Program options', -command => [\&makeTop, "SFPscan Help - Program options", $help_options]); $mbp->command( -label => 'Genotyping Rules', -command => [\&makeTop, "SFPscan Help - Genotyping Rules", $help_genotyping]); $mbp->command( -label => 'About SFPscan', -command => [\&makeTop, "SFPscan Help - About SFPscan", $help_about]); # create frame for parameters my $frame_parameters = $mw -> Frame(-background=>'black'); # create entry widgets for parameters my $spread_label = $frame_parameters -> Label(-text=>"Gap: "); $spread_label->configure(-background=>'black', -foreground=>'white'); my $ent_spread = $frame_parameters -> Entry(-textvariable =>\$spread); my $distortion_label = $frame_parameters -> Label(-text=>"Allele frequency distortion: "); $distortion_label->configure(-background=>'black', -foreground=>'white'); my $ent_distortion = $frame_parameters -> Entry(-textvariable =>\$distortion); my $unassigned_label = $frame_parameters -> Label(-text=>"Missing data: "); $unassigned_label->configure(-background=>'black', -foreground=>'white'); my $ent_unassigned = $frame_parameters -> Entry(-textvariable =>\$unassigned); my $output_label = $frame_parameters -> Label(-text=>"Output: "); $output_label->configure(-background=>'black', -foreground=>'white'); my $ent_output = $frame_parameters -> Entry(); # Use C-D option radiobutton widget my $label_cd = $frame_parameters -> Label(-text=>"Use C-D in genotyping? "); $label_cd->configure(-background=>'black', -foreground=>'white'); my $radio_no = $frame_parameters -> Radiobutton(-text=>"NO ", -value=>"NO", -variable=>\$use_cd, -background=>'black', -foreground=>'white', -activebackground=>'black', -activeforeground=>'white', -selectcolor=>'black'); $radio_no->select(); my $radio_yes = $frame_parameters -> Radiobutton(-text=>"YES", -value=>"YES",-variable=>\$use_cd, -background=>'black', -foreground=>'white', -activebackground=>'black', -activeforeground=>'white', -selectcolor=>'black'); # create start analysis button my $but = $mw -> Button(-text=>"Start Analysis", -command =>\&start_analysis, -background=>'white', -foreground=>'black'); # create text area for program messages my $textarea = $mw -> Frame(-background=>'black'); my $text_label = $textarea -> Label(-text=>"Program messages: "); $text_label->configure(-background=>'black', -foreground=>'white'); my $txt = $textarea -> Text(-width=>80, -height=>27); my $srl_y = $textarea -> Scrollbar(-orient=>'v',-command=>[yview => $txt]); my $srl_x = $textarea -> Scrollbar(-orient=>'h',-command=>[xview => $txt]); $txt -> configure(-yscrollcommand=>['set', $srl_y], -xscrollcommand=>['set',$srl_x]); # set layout for all frames # menu bar frame $mbf->grid(-row => 0, -column => 0, -sticky => 'w'); $mbp->grid(-row => 0, -column => 1, -sticky => 'e'); $menu_frame -> grid(-row=>1,-column=>1,-columnspan=>2, -sticky => 'ew'); #parameters frame $spread_label -> grid(-row=>2,-column=>1, -sticky => 'w'); $ent_spread -> grid(-row=>2,-column=>2, -sticky => 'w'); $distortion_label -> grid(-row=>3,-column=>1, -sticky => 'w'); $ent_distortion -> grid(-row=>3,-column=>2, -sticky => 'w'); $unassigned_label -> grid(-row=>4,-column=>1, -sticky => 'w'); $ent_unassigned -> grid(-row=>4,-column=>2, -sticky => 'w'); $output_label -> grid(-row=>5,-column=>1, -sticky => 'w'); $ent_output -> grid(-row=>5,-column=>2, -sticky => 'w'); $label_cd -> grid(-row=>6,-column=>1, -sticky => 'w'); $radio_no -> grid(-row=>6,-column=>2, -sticky => 'w'); $radio_yes -> grid(-row=>7,-column=>2, -sticky => 'w'); $frame_parameters -> grid(-row=>2,-column=>1,-columnspan=>2, -sticky => 'ew'); # button $but -> grid(-row=>4,-column=>2, -sticky => 'w'); # text area frame $text_label -> grid(-row=>1,-column=>1, -sticky => 'w'); $txt -> grid(-row=>2,-column=>1); $srl_y -> grid(-row=>2,-column=>2,-sticky=>"ns"); $srl_x -> grid(-row=>3,-column=>1,-sticky=>"ew"); $textarea -> grid(-row=>5,-column=>1,-columnspan=>2, -sticky => 'ew'); MainLoop(); sub fileOpen{ # define file types that can be opened in file dialog my $types = [ ['Text Files', ['.txt', '.text']], ['All Files', '*', ], ]; my ($option) = @_; if($option eq "conversion"){ $probeset_gene_file = $mw->getOpenFile(-filetypes=>$types, -title=>"Open Conversion File"); # Let user know $txt -> insert('end',"Conversion file selected: $probeset_gene_file\n\n"); } # end if elsif($option eq "setup"){ $expfile = $mw->getOpenFile(-filetypes=>$types, -title=>"Open Experimental Setup file"); # Let user know $txt -> insert('end',"Experimental Setup file selected: $expfile\n\n"); } # end elsif elsif($option eq "chiplist"){ $chipidfile = $mw->getOpenFile(-filetypes=>$types, -title=>"Open Chip List file"); # Let user know $txt -> insert('end',"Chip List file selected: $chipidfile\n\n"); } # end elsif elsif($option eq "parent_A"){ $parental_A_chips_file = $mw->getOpenFile(-filetypes=>$types, -title=>"Parent A Chip List file"); # Let user know $txt -> insert('end',"Parental A Chip List file selected: $parental_A_chips_file\n\n"); } # end elsif elsif($option eq "parent_B"){ $parental_B_chips_file = $mw->getOpenFile(-filetypes=>$types, -title=>"Parent B Chip List file"); # Let user know $txt -> insert('end',"Parental B Chip List file selected: $parental_B_chips_file\n\n"); } # end elsif } # end sub routine fileOpen sub dirOpen{ $cel_dir = $mw->chooseDirectory(-title=>"Select Folder Background Corrected .CEL files"); # Let user know $txt -> insert('end',"Folder Background Corrected .CEL files selected: $cel_dir\n\n"); } # end sub routine dirOpen sub warningDialog{ my ($text) = @_; $mw -> messageBox(-type=>"ok", -message=>$text, -icon=>"error"); } # end sub routine warningDialog sub questionDialog{ my ($text) = @_; my $response = $mw -> messageBox(-type=>"yesno", -message=>$text, -icon=>"question"); return $response; } # end sub routine questionDialog # A function to make a toplevel window for the Help topics sub makeTop { my $title = $_[0]; my $text = $_[1]; my $top = $mw -> Toplevel(); $top->title($title); # frame in the top level my $textarea = $top -> Frame(); my $txt = $textarea -> Text(-width=>80, -height=>27); my $srl_y = $textarea -> Scrollbar(-orient=>'v',-command=>[yview => $txt]); my $srl_x = $textarea -> Scrollbar(-orient=>'h',-command=>[xview => $txt]); $txt -> configure(-yscrollcommand=>['set', $srl_y], -xscrollcommand=>['set',$srl_x]); # insert text $txt -> insert('end',$text); # create close button my $but_close = $textarea -> Button(-text=>"Close this window", -command => sub { destroy $top; } ); # geometry $text_label -> grid(-row=>1,-column=>1, -sticky => 'w'); $txt -> grid(-row=>2,-column=>1); $srl_y -> grid(-row=>2,-column=>2,-sticky=>"ns"); $srl_x -> grid(-row=>3,-column=>1,-sticky=>"ew"); # button $but_close -> grid(-row=>4,-column=>1, -sticky => 'ew'); $textarea -> grid(-row=>5,-column=>1,-columnspan=>2, -sticky => 'ew'); } # end sub routine makeTop # This function will be executed when the start analysis button is pushed sub start_analysis { # get values entered by user $spread = $ent_spread -> get(); $distortion = $ent_distortion -> get(); $unassigned = $ent_unassigned -> get(); $output = $ent_output -> get(); # Check input values if($spread eq ""){ warningDialog("Please enter a GAP SIZE!\n"); return; } # end if elsif($spread < 1){ my $response = questionDialog("Are you sure you want to use a GAP SIZE smaller than 1.0?\nThis will give overlapping distributions!\n"); if($response eq "no"){ return; } # end if } # end elsif if($distortion eq ""){ warningDialog("Please enter an ALLELE DISTORTION value!\n"); return; } # end if elsif($distortion > 80){ my $response = questionDialog("Are you sure you want to allow an ALLELE DISTORTION greater than 80%?\n"); if($response eq "no"){ return; } # end if } # end elsif if($unassigned eq ""){ warningDialog("Please enter a UNASSIGNED DATA value!\n"); return; } # end if elsif($unassigned > 10){ my $response = questionDialog("Are you sure you want to allow more than 10% UNASSIGNED SCORES?\n"); if($response eq "no"){ return; } # end if } # end elsif if($output eq ""){ warningDialog("Please give an output name!\n"); return; } # end if if($cel_dir eq ""){ warningDialog("Please select a folder with the background corrected .CEL files!\n"); return; } # end if if($probeset_gene_file eq ""){ warningDialog("Please select a Conversion file!\n"); return; } # end if if($expfile eq ""){ warningDialog("Please select a Experimental Setup file!\n"); return; } # end if if($chipidfile eq ""){ warningDialog("Please select a Chip List file!\n"); return; } # end if if($parental_A_chips_file eq ""){ warningDialog("Please select a Parental A Chip List file!\n"); return; } # end if if($parental_B_chips_file eq ""){ warningDialog("Please select a Parental B Chip List file!\n"); return; } # end if ################################################################################################ # Define genotypes my $genotype_A = "A"; my $genotype_B = "B"; ################################################################################################ # Open logfile my $logfile = $output."_SFPscanGUI_V1_log.txt"; open(LOGFILE, ">$logfile"); if(!LOGFILE){ warningDialog("Can't open '$logfile'!\n"); return; } # Let user know input files are being read $txt -> insert('end',"Reading input files...\n\n"); ################################################################################################ # Open conversion file with probeset and gene id open(PGFILE, "$probeset_gene_file"); if(!PGFILE){ warningDialog("Can't open '$probeset_gene_file'!\n"); return; } my @pg_data = ; # close input file close(PGFILE); # remove header line shift(@pg_data); # Store in hash for easy lookup later my %pg_hash; # File has format: probeset gene_id # Store in hash with probeset as key, and gene_id as value # $1 is probe set, $2 is gene_id foreach my $line(@pg_data){ my @split_line = split(/\t/, $line); chomp($split_line[1]); $pg_hash{$split_line[0]} = $split_line[1]; } # end foreach # Let user know $txt -> insert('end',"probeset-gene_id file read: $probeset_gene_file\n\n"); print LOGFILE "probeset-gene_id file read: $probeset_gene_file\n\n"; ################################################################################################ # Open inputfile with RIL experiment setup. open(EXPFILE, "$expfile"); if(!EXPFILE){ warningDialog("Can't open '$expfile'!\n"); return; } my @exp_setup = ; # close input file close(EXPFILE); # Store in hash for easy lookup later my %exp_setup_hash; # Get IDs for reps my @rep_IDs = split(/\t/, shift(@exp_setup)); # Check if file header line starting with ';' my $check_header = shift(@rep_IDs); if(substr($check_header, 0, 1) ne ";"){ warningDialog("RIL experiment setup file $expfile does not have the correct format!\n"); print LOGFILE "RIL experiment setup file $expfile does not have the correct format!\n"; return; } # end if # remove newline chars foreach(@rep_IDs){ chomp; } # end foreach # Determine number of reps my $number_reps = scalar @rep_IDs; # Loop through file content and store in hash foreach my $line(@exp_setup){ # remove newline char chomp($line); my @current_line_array = split(/\t/, $line); # get RIL code my $current_RIL_code = shift(@current_line_array); # get rep codes for(my $i = 0; $i < $number_reps; $i++){ $exp_setup_hash{$current_line_array[$i]}->{'rep'} = $i + 1; $exp_setup_hash{$current_line_array[$i]}->{'treatment'} = $rep_IDs[$i]; $exp_setup_hash{$current_line_array[$i]}->{'RIL'} = $current_RIL_code; } # end for } # end foreach # Let user know this is done $txt -> insert('end',"RIL experiment setup file read: $expfile\n\n"); print LOGFILE "RIL experiment setup file read: $expfile\n"; $txt -> insert('end',"Number of reps determined: $number_reps.\n\n"); print LOGFILE "Number of reps determined: $number_reps.\n\n"; ################################################################################################ # Open file with parental chip IDs open(PARENTA, "$parental_A_chips_file"); if(!PARENTA){ warningDialog("Can't open '$parental_A_chips_file'!\n"); return; } my @parental_A_chips = ; # close file close(PARENTA); # remove end of line chars foreach my $line(@parental_A_chips){ chomp($line); } # end foreach open(PARENTB, "$parental_B_chips_file"); if(!PARENTB){ warningDialog("Can't open '$parental_B_chips_file'!\n"); return; } my @parental_B_chips = ; # close file close(PARENTB); # remove end of line chars foreach my $line(@parental_B_chips){ chomp($line); } # end foreach ################################################################################################ # Open file with chip IDs open(CHIPIDFILE, "$chipidfile"); if(!CHIPIDFILE){ warningDialog("Can't open '$chipidfile'!\n"); return; } my @chipids = ; # close file close(CHIPIDFILE); # remove end of line chars foreach my $line(@chipids){ chomp($line); } # end foreach # Let user know $txt -> insert('end',"ChipID file read: $chipidfile\n\n"); print LOGFILE "ChipID file read: $chipidfile\n\n"; ################################################################################################ # Open one chip file to get list of probeset codes. open(FILE, "$chipids[0]"); if(!FILE){ $txt -> insert('end',"Can't open '$chipids[0]'!\n\n"); warningDialog("Can't open '$chipids[0]'!\n"); return; } my @data = ; # close file close(FILE); # remove header line shift(@data); # Get list of probeset names and store in array my @temp; my $previous_probeset; my $current_probeset; my $probe_number = 0; my $first_line_done = "FALSE"; foreach my $line(@data){ if($line =~ /\"(.+?_at)/g){ $current_probeset = $1; # First line case if($first_line_done eq "FALSE"){ $previous_probeset = $current_probeset; $first_line_done = "TRUE"; } # end if # Evaluate if($current_probeset eq $previous_probeset){ $probe_number++; } else{ my @to_add = ($previous_probeset, $probe_number); push(@temp, \@to_add); $probe_number = 1; } # end else } # end if $previous_probeset = $current_probeset; } # end foreach # Sort list my @probesets = sort {lc($a->[0]) cmp lc($b->[0])} @temp; ################################################################################################ # Open all chip files # Create array with filehandles for all files my @chip_fh; # Create filehandles foreach my $file(@chipids){ my $fh = IO::File->new("< $file"); if(!$fh){ $txt -> insert('end',"Can't open '$file'!\n\n"); warningDialog("Can't open '$file'!\n"); return; } my @temp = ($fh, $file); push(@chip_fh, \@temp); } # end foreach ################################################################################################ # Open output file to store for each marker: which parent has the SFP, # max_SFPdev nonSFP parent, min_SFPdev SFP parent, spread, # frequency A allele, frequency B allele, frequency missing data. my $output_0 = $output."_SFPscanGUI_V1_marker_data.txt"; open(OUTPUT0, ">$output_0"); # Print header print OUTPUT0 "SFPmarker\tParental line with SFP\tmax_SFPdev nonSFP parent\tmin_SFPdev SFP parent\tSpread\tA frequency\tB frequency\tmissing data frequency\n"; # Let user know analysis starts $txt -> insert('end',"Starting SFP detection...\n\n"); ################################################################################################ # Loop through probeset. # For each probeset, open all BG corrected RIL files and get expression values for # all probes belonging to that probeset. # Then calculate SFPdev for all probes in the probesets, for all chips. # Store SFPdev values in an array, sort the array, the loop through # the array and test if there are gaps with a spread > 2. # If such a gap is found, check if all chips of one parent fall on the left of the gap, # and all chips of the other parent on the right of the gap. # Thus, if a gap with spread > 2 is found AND the distribution of the parental chips is okay, # then this probe is an SFP marker. # Genotype all the chips falling to the left of the gap as the corresponding parent, # and all the chips falling to the right of the gap as the other parent. # # Result of all detected SFP markers my %result; # Result of all post-genotyping filtered SFP markers my %result_clean; my $line_count = 0; my $number_of_probesets = scalar (@probesets); my $probeset_counter = 0; for (my $i = 0; $i < scalar (@probesets); $i++){ # update mainwindow $mw->update(); #show latest text $txt->see('end'); # my $probeset = $probesets[$i]->[0]; my $number_probes = $probesets[$i]->[1]; # loop through all chip files and get data for this probeset my @SFPdev_current_probeset_all_chips; foreach my $file(@chip_fh){ my @expression_temp; for(my $j = 0; $j < $number_probes; $j++){ *FILE = $file->[0]; my $temp_line = ; # Skip header line in file if($temp_line =~ /.+?\.CEL/g){ $temp_line = ; } # end if my $probeset_file; my $probe; my $expression; if($temp_line =~ /\"(.+?_at)(\d{1,2})\",(\d+?\.\d+)/g){ $probeset_file = $1; if($probeset_file != $probeset){ $txt -> insert('end',"Data not consistent! Probeset in list: $probeset, probeset in file $file->[1]: $probeset_file\n"); warningDialog("Data not consistent! Probeset in list: $probeset, probeset in file $file->[1]: $probeset_file\n"); print LOGFILE "Data not consistent! Probeset in list: $probeset, probeset in file $file->[1]: $probeset_file\n"; close(LOGFILE); return; } # end if $probe = $2; $expression = $3; } # end if else{ $txt -> insert('end',"Error reading data from file $file->[1] near line $line_count.\nData not in expected format!\n"); warningDialog("Error reading data from file $file->[1] near line $line_count.\nData not in expected format!\n"); print LOGFILE "Error reading data from file $file->[1] near line $line_count.\nData not in expected format!\n"; close(LOGFILE); return; } # end else push(@expression_temp, $expression); } # end for # # Calculate SFPdev for this chip for all the probes in this probeset my @SFPdev_current_probeset; for(my $q = 0; $q < $number_probes; $q++){ my $SFPprobe_exp = $expression_temp[$q]; my @nonSFP_expression; for(my $r = 0; $r < $number_probes; $r++){ if($r == $q){ next; } # end if else{ push(@nonSFP_expression, $expression_temp[$r]); } # end else } # end for # Calculate SFPdev # # Abs(SFP expression - Average expression other probes) # SFPdev value = ----------------------------------------------------- # SFP expression # my $nonSFPexp_mean = Statistics::Basic::Mean->new(\@nonSFP_expression)->query; my $SFPdev = abs($SFPprobe_exp - $nonSFPexp_mean)/$SFPprobe_exp; my @to_add = ($SFPdev, $file->[1]); push(@SFPdev_current_probeset, \@to_add); } # end for push(@SFPdev_current_probeset_all_chips, \@SFPdev_current_probeset); } # end foreach # Now I have an array with pointers (one for each chip) to arrays # with pointers (one for each probe) to arrays with SFPdev values and corresponding filenames # for all probes of current probeset # Analyze SFPdevs for each probe seperate # Create a new array for one probe with SFPdev values from all chips for(my $s = 0; $s < $number_probes; $s++){ my @SFPdev_current_probe; foreach my $item(@SFPdev_current_probeset_all_chips){ my @to_add = ($item->[$s]->[0], $item->[$s]->[1]); push(@SFPdev_current_probe, \@to_add); } # end foreach # Sort SFPdev array my @SFPdev_sorted = sort {$a->[0] <=> $b->[0]} @SFPdev_current_probe; # Loop through sorted SFPdev array to see if a gap with spread > user defined spread can be found #################### my $left_pos; my $right_pos; my $gap_count = 0; my $gap_valid = "FALSE"; my $left_pos_set = "FALSE"; my $genotype_left; my $genotype_right; my $SFP_parent; my $marker_spread; my $number_of_chips = scalar @SFPdev_sorted; my $A_freq; my $B_freq; my $nodata_freq; my $number_parental_chips; my $bad_marker = "FALSE"; for(my $k = 0; $k < (scalar @SFPdev_sorted - 1); $k++){ ################ next if($SFPdev_sorted[$k]->[0] == 0); my $current_spread = ($SFPdev_sorted[$k+1]->[0])/($SFPdev_sorted[$k]->[0]); if($current_spread > $spread){ ################# #print "Spread greater than $spread!\n"; $marker_spread = $current_spread; # Check if all chips of one parent fall on one side, # and all chips of the other parent on the other side my $all_A_left = 0; my $all_A_right = 0; my $all_B_left = 0; my $all_B_right = 0; my $number_A_chips = scalar @parental_A_chips; my $number_B_chips = scalar @parental_B_chips; ###### store number of parental chips for later use ################## $number_parental_chips = $number_A_chips + $number_B_chips; ########### check parental chips that fall on left of gap ############# for(my $l = 0; $l < $k + 1; $l++){ foreach my $parental_A_chip(@parental_A_chips){ if($SFPdev_sorted[$l]->[1] eq $parental_A_chip){ $all_A_left++; } # end if } # end foreach foreach my $parental_B_chip(@parental_B_chips){ if($SFPdev_sorted[$l]->[1] eq $parental_B_chip){ $all_B_left++; } # end if } # end foreach } # end for # check parental chips that fall on right of gap for(my $m = $k + 1; $m < scalar @chipids; $m++){ foreach my $parental_A_chip(@parental_A_chips){ if($SFPdev_sorted[$m]->[1] eq $parental_A_chip){ $all_A_right++; } # end if } # end foreach foreach my $parental_B_chip(@parental_B_chips){ if($SFPdev_sorted[$m]->[1] eq $parental_B_chip){ $all_B_right++; } # end if } # end foreach } # end for # # Check if parental distributions allow this gap to be valid my $probenumber; if(($all_A_left == $number_A_chips) and ($all_B_right == $number_B_chips)){ $gap_valid = "TRUE"; $genotype_left = "A"; $genotype_right = "B"; $SFP_parent = "B"; # Update gap counter $gap_count++; } # end if elsif(($all_B_left == $number_B_chips) and ($all_A_right == $number_A_chips)){ $gap_valid = "TRUE"; $genotype_left = "B"; $genotype_right = "A"; $SFP_parent = "A"; # Update gap counter $gap_count++; } # end elsif else{ $gap_valid = "FALSE"; } # end else # # Set left position of gap # Only do this if the gap is valid and the left position has not been set yet if(($gap_valid eq "TRUE") and ($left_pos_set eq "FALSE")){ $left_pos = $k; $left_pos_set = "TRUE"; } # end if # Set right position of gap # Only do this if the gap is valid if($gap_valid eq "TRUE"){ $right_pos = $k + 1; } # end if } # end if } # end for ############ Do genotyping if a valid gap was found ############## if($gap_valid eq "TRUE"){ ############# calculations for allele frequencies ################# if($genotype_left eq "A"){ $A_freq = (($left_pos + 1) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); $B_freq = (($number_of_chips - $right_pos) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); } # end if else{ $B_freq = (($left_pos + 1) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); $A_freq = (($number_of_chips - $right_pos) - ($number_parental_chips/2))/($number_of_chips - $number_parental_chips); } # end else ############ calculate no data frequency ######################## $nodata_freq = ($right_pos - $left_pos - 1)/($number_of_chips - $number_parental_chips); ############ Determine if marker should be discarded in post-genotyping filtering ################ if( ($A_freq > ($distortion/100)) or ($B_freq > ($distortion/100)) or ($nodata_freq > ($unassigned/100))){ $bad_marker = "TRUE"; } # end if ############ Report SFP marker ################## my $probenumber = $s + 1; $txt -> insert('end',"SFPmarker found: $probeset, $pg_hash{$probeset}-$probenumber\n"); print LOGFILE "SFPmarker found: $probeset, $pg_hash{$probeset}-$probenumber\n"; LOGFILE->autoflush(1); # Write to result file which parent has the SFP for this marker, max_SFPdev nonSFP parent, # min_SFPdev SFP parent, frequency A allele, frequency B allele, frequency missing data. print OUTPUT0 "$pg_hash{$probeset}-$probenumber\t$SFP_parent\t$SFPdev_sorted[$left_pos]->[0]\t$SFPdev_sorted[$right_pos]->[0]\t$marker_spread\t$A_freq\t$B_freq\t$nodata_freq\n"; OUTPUT0->autoflush(1); # Genotype on left of gap for(my $l = 0; $l < $left_pos + 1; $l++){ # Store result in %result hash # Get chip code from file code for storing genotyping results my $chip_code = $SFPdev_sorted[$l]->[1]; chomp($chip_code); $chip_code =~ s/.CEL//g; my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; my $SFPmarker = $pg_hash{$probeset}."-".$probenumber; $result{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_left; # Store result in post-genotyping filtered result hash if marker is not bad if($bad_marker eq "FALSE"){ $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_left; } # end if } # end for # Genotype on right of gap for(my $m = $right_pos; $m < scalar @chipids; $m++){ # Store result in %result hash # Get chip code from file code for storing genotyping results my $chip_code = $SFPdev_sorted[$m]->[1]; chomp($chip_code); $chip_code =~ s/.CEL//g; my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; my $SFPmarker = $pg_hash{$probeset}."-".$probenumber; $result{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_right; # Store result in post-genotyping filtered result hash if marker is not bad if($bad_marker eq "FALSE"){ $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = $genotype_right; } # end if } # end for # Genotype within gap as '-' if($gap_count > 1){ for(my $n = $left_pos + 1; $n < $right_pos; $n++){ # Store result in %result hash # Get chip code from file code for storing genotyping results my $chip_code = $SFPdev_sorted[$n]->[1]; chomp($chip_code); $chip_code =~ s/.CEL//g; my $RIL = $exp_setup_hash{$chip_code}->{'RIL'}; my $SFPmarker = $pg_hash{$probeset}."-".$probenumber; $result{$RIL}->{$SFPmarker}->{$chip_code} = "-"; # Store result in post-genotyping filtered result hash if marker is not bad if($bad_marker eq "FALSE"){ $result_clean{$RIL}->{$SFPmarker}->{$chip_code} = "-"; } # end if } # end for } # end if } # end if } # end for # update line counter $line_count = $line_count + $number_probes; # update probeset counter $probeset_counter++; $txt -> insert('end',"probeset $probeset_counter of $number_of_probesets.\n"); } # end foreach # Close output file close(OUTPUT0); ################################################################################################ # Write this result to a file in format: # SFPmarker RIL chip rep treatment genotype my $output_1 = $output."_SFPscanGUI_V1_genotypes_all_chips.txt"; open(OUTPUT1, ">$output_1"); if(!OUTPUT1){ $txt -> insert('end',"Can't open '$output_1'!\n"); warningDialog("Can't open '$output_1'!\n"); return; } # Print header print OUTPUT1 "SFPmarker\tRIL\tchip\trep\tgenotype\n"; # Print values # Remember format of result hash: # %result -> RIL1 -> SFPmarker1 -> chip1 = genotype A or B # -> chip2 = genotype A or B # -> chip3 = genotype A or B # -> chip4 = genotype A or B # -> SFPmarker2 -> chip1 = genotype A or B # etc # etc # etc while(my ($RIL, $SFPmarker) = each (%result)){ while(my ($SFPmarker, $chip) = each %$SFPmarker){ while(my ($chip, $genotype) = each %$chip){ print OUTPUT1 "$SFPmarker\t"; print OUTPUT1 "$RIL\t"; print OUTPUT1 "$chip\t"; print OUTPUT1 "$exp_setup_hash{$chip}->{'rep'}\t"; print OUTPUT1 "$genotype\n"; } # end while } # end while } # end while close(OUTPUT1); $txt -> insert('end',"Output file '$output_1' written...\n"); print LOGFILE "Output file '$output_1' written...\n"; # Let user know genotyping starts $txt -> insert('end',"Starting RIL genotyping...\n\n"); ################################################################################################ # Do genotyping for all RILs with all detected SFP markers my %SFPmarker_calls; genotype(\%result, $genotype_A, $genotype_B, $number_reps, \%SFPmarker_calls, \*LOGFILE, $use_cd); # Do genotyping for all RILs with post-genotyping filtered SFP markers my %SFPmarker_calls_clean; genotype(\%result_clean, $genotype_A, $genotype_B, $number_reps, \%SFPmarker_calls_clean, \*LOGFILE, $use_cd); ################################################################################################ # Write genotyping data for RILs done with all markers to files. # Open files my $output_2_jm = $output."_SFPscanGUI_V1_genotypes_jm.txt"; open(OUTPUT2JM, ">$output_2_jm"); if(!OUTPUT2JM){ $txt -> insert('end',"Can't open '$output_2_jm'!\n"); warningDialog("Can't open '$output_2_jm'!\n"); return; } my $output_2_ex = $output."_SFPscanGUI_V1_genotypes_ex.txt"; open(OUTPUT2EX, ">$output_2_ex"); if(!OUTPUT2EX){ $txt -> insert('end',"Can't open '$output_2_ex'!\n"); warningDialog("Can't open '$output_2_ex'!\n"); return; } write_genotypes(\%SFPmarker_calls, \*OUTPUT2JM, \*OUTPUT2EX, $output_2_jm, $output_2_ex, \*LOGFILE); # close files close(OUTPUT2JM); close(OUTPUT2EX); ################################################################################################ # Write genotyping data for RILs done with post-genotyping filtered markers to files. # Open files my $output_2_jm_clean = $output."_SFPscanGUI_V1_genotypes_clean_jm.txt"; open(OUTPUT2JMCLEAN, ">$output_2_jm_clean"); if(!OUTPUT2JMCLEAN){ $txt -> insert('end',"Can't open '$output_2_jm_clean'!\n"); warningDialog("Can't open '$output_2_jm_clean'!\n"); return; } my $output_2_ex_clean = $output."_SFPscanGUI_V1_genotypes_clean_ex.txt"; open(OUTPUT2EXCLEAN, ">$output_2_ex_clean"); if(!OUTPUT2EXCLEAN){ $txt -> insert('end',"Can't open '$output_2_ex_clean'!\n"); warningDialog("Can't open '$output_2_ex_clean'!\n"); return; } write_genotypes(\%SFPmarker_calls_clean, \*OUTPUT2JMCLEAN, \*OUTPUT2EXCLEAN, $output_2_jm_clean, $output_2_ex_clean, \*LOGFILE); # close files close(OUTPUT2JMCLEAN); close(OUTPUT2EXCLEAN); ################################################################################################ # Say bye and close log file print LOGFILE "Done. Enjoy!\n"; close(LOGFILE); # Tell user to check logfile $txt -> insert('end',"Done!\n\nCheck log: $logfile.\n"); } # end sub routine start analysis ####################################################################################### ############################ Sub routine genotype ##################################### ####################################################################################### sub genotype { ###################################################################### # Now evaluate the genotyping for the specified number of reps of each RIL # Create new hash with new genotyping codes, one for each RIL # A = SHA, B = BAY, C = NOT A, D = NOT B, - = A+B, - = no data # If all rep chips are genotyped as one accession: use A or B # If half of the rep chips is genotyped for one accession, # and the other half of the rep chips for the other accession: use - # If half or more than half of the rep chips is genotyped as one accession, # and the other rep chips as no data: use C or D # NOTE: Use the C and D scores only if the user wants to use it !!! # Else, use: - # # Arguments to receive from sub routine call: # 1. reference to result hash with format: # %result -> RIL1 -> SFPmarker1 -> chip1 = genotype A or B # -> chip2 = genotype A or B # -> chip3 = genotype A or B # -> chip4 = genotype A or B # -> SFPmarker2 -> chip1 = genotype A or B # etc # etc # etc # # 2. genotype codes ($genotype_A and $genotype_B) # # 3. number of reps ($number_reps) # # 4. reference to hash to store result # # 5. reference to logfile FILE handle # # 6. use_cd option: yes or no. # # # Return variable: # hash %SFPmarker_calls with format: # SFPmarker_calls -> SFPmarker -> $RIL = $genotype; # # # Retrieve arguments my $inputresult_ref = $_[0]; my $genotype_A = $_[1]; my $genotype_B = $_[2]; my $number_reps = $_[3]; my $outputresult_ref = $_[4]; my $log_ref = $_[5]; my $use_cd = $_[6]; # # Loop through RILs and store calls while(my ($RIL, $SFPmarker) = each (%{$inputresult_ref})){ # Loop through SFP markers while(my ($SFPmarker, $chip) = each (%$SFPmarker)){ my $genotype_to_add; # use a hash as counter, key is genotype from previous step my %genotype; $genotype{$genotype_A} = 0; $genotype{$genotype_B} = 0; $genotype{'-'} = 0; my $chipcount = 0; # Loop through chips while(my ($chip, $call) = each %$chip){ $genotype{$call}++; $chipcount++; } # end while if($chipcount != $number_reps){ $genotype_to_add = '-'; $txt -> insert('end',"RIL $RIL only has $chipcount chips! Check input files! Genotyped as '-' (no data).\n"); print {$log_ref} "RIL $RIL only has $chipcount chips! Check input files! Genotyped as '-' (no data).\n"; next; } # end if elsif($genotype{$genotype_A} == $number_reps){ $genotype_to_add = 'A'; } # end elsif elsif($genotype{$genotype_B} == $number_reps){ $genotype_to_add = 'B'; } # end elsif elsif(($genotype{$genotype_A} == ($number_reps/2)) and ($genotype{$genotype_B} == ($number_reps/2))){ $genotype_to_add = '-'; } # end elsif elsif(($genotype{$genotype_A} >= ($number_reps/2)) and ($genotype{$genotype_B} == 0)){ if($use_cd eq "YES"){ $genotype_to_add = 'D'; } # end if else{ $genotype_to_add = '-'; } # end else } # end elsif elsif(($genotype{$genotype_B} >= ($number_reps/2)) and ($genotype{$genotype_A} == 0)){ if($use_cd eq "YES"){ $genotype_to_add = 'C'; } # end if else{ $genotype_to_add = '-'; } # end else } # end elsif else{ $genotype_to_add = '-'; } # end elsif $outputresult_ref->{$SFPmarker}->{$RIL} = $genotype_to_add; } # end while $txt -> insert('end',"RIL '$RIL' genotyped...\n"); print {$log_ref} "RIL '$RIL' genotyped...\n"; } # end while $txt -> insert('end',"All RILs genotyped...\n"); print {$log_ref} "All RILs genotyped...\n"; } # end sub routine genotype ####################################################################################### ############################ Sub routine write_genotypes ############################## ####################################################################################### sub write_genotypes { ####################################################################### # Write result of genotyping to files in two different formats: # 1) for JoinMap # marker14 # DHBHHADAADBADBHHDBHHBBDHHADDBAHDAHHBHAABABDAHHBBHH # marker30 # AHBAHBBBHABBAHAABHBBHHHBBHHHHBAHABHHHHAHHHAHBBAHA-H # marker35 # BHHDHAABHDHHBHHABABBAAAABHBHHDHHBBDCHDHDHHBAHBA-DA # etc # # 2) For visualization in Excel # marker14 D H B H H A D A A D B etc # etc # This should be tabulated. # # Arguments to receive: # 1. reference to SFPmarker_calls hash # Remember format SFPmarker_calls hash: # SFPmarker_calls -> SFPmarker -> $RIL = $genotype; # # 2. reference to output file handle for joinmap format # # 3. reference to output file handle for Excel format # # 4. name file joinmap format # # 5. name file excel format # # 6. reference to logfile # # # Retrieve arguments my $hash_ref = $_[0]; my $file_jm_ref = $_[1]; my $file_ex_ref = $_[2]; my $jm_file_name = $_[3]; my $ex_file_name = $_[4]; my $log_ref = $_[5]; # Print header print {$file_jm_ref} "\;RIL genotyping with SFP markers\n\;$number_reps chips per RIL\n"; print {$file_ex_ref} "\;RIL genotyping with SFP markers\n\;$number_reps chips per RIL\n"; # Loop through markers # Arrays for excel output my @markers; my @calls; my @RILs_list; while(my ($SFPmarker, $RIL) = each (%{$hash_ref})){ # Marker calls for all RILs my $genotypes_string = ""; # Print marker name print {$file_jm_ref} "$SFPmarker\n"; push(@markers, $SFPmarker); # Get keys containing RILs my @RILs = keys %$RIL; # Sort RILs my @RILs_sorted = sort { $a <=> $b } @RILs; @RILs_list = @RILs_sorted; # Get genotype call for all RILs my @current_calls; foreach my $item(@RILs_sorted){ $genotypes_string = $genotypes_string.$hash_ref->{$SFPmarker}->{$item}; push(@current_calls, $hash_ref->{$SFPmarker}->{$item}); } # end foreach # Print string print {$file_jm_ref} "$genotypes_string\n"; push(@calls, \@current_calls); } # end while # Output header line with RIL numbers to excel format file print {$file_ex_ref} "\t"; foreach my $item(@RILs_list){ print {$file_ex_ref} "$item\t"; } # end foreach print {$file_ex_ref} "\n"; # Print info for each marker for(my $i = 0; $i < scalar @markers; $i++){ print {$file_ex_ref} "$markers[$i]\t"; my $current_calls = @calls[$i]; foreach my $call(@$current_calls){ print {$file_ex_ref} "$call\t"; } # end foreach print {$file_ex_ref} "\n"; } # end for $txt -> insert('end',"Output file '$jm_file_name' written...\n"); print {$log_ref} "Output file '$jm_file_name' written...\n"; $txt -> insert('end',"Output file '$ex_file_name' written...\n"); print {$log_ref} "Output file '$ex_file_name' written...\n"; } # end sub routine write_genotypes