#!/usr/bin/perl
package Datadump;

#######################################################################
# Author:     Stan Dong
# Date:       December 2001
# Comments:   This package contains all nexessary methods to create
#             data dump files at /share/ftp/Dictyostelium/data_dump
# Documentation
#      http:///usr/local/dicty/www_dictybase/db/lib/staff/dictyBase/programmer/object_docs/
#             Datadump.html
#######################################################################
 
use strict;
use DBI;
use lib "/usr/local/dicty/www_dictybase/db/lib/common";
use Login qw (ConnectToDatabase);
use TextUtil qw (DeleteUnwantedChar);
use lib "/usr/local/dicty/www_dictybase/db/lib/dictyBase";
 
#######################################################################
#################### global variables #################################
#######################################################################

my $database;
my $dbuser = 'OTTO';
my $password;
my $dbh;

my %locusdictyBaseid;
my %featdictyBaseid;
my %dictyBaseidOtherFeat; 
my %GeneNonStandard;

#######################################################################
sub new {      ############ constructor ###############################
#######################################################################
       
    my ($class, %args) = @_;

    my $self = {};

    bless $self, $class;

    $database = $args{'database'};

    $password = `/usr/local/bin/db-auth ${dbuser}\@${database}`;

    chomp $password;

    $dbh = &ConnectToDatabase($database, $dbuser, $password);

    $self->get_dictyBaseid;

    $self->get_geneNonStandard;

    return $self;
}

#######################################################################
sub get_dictyBaseid  {
#######################################################################

    my $locus_sth = $dbh->prepare("
        SELECT s.dictyBaseid, s.primary_key, f.feature_no
        FROM CGM_DDB.dictyBaseid s, CGM_DDB.feature f 
        WHERE s.dictyBaseid_type = ? and 
              s.tab_name = 'LOCUS' and 
              s.primary_key = f.locus_no (+)
              
    ");

    $locus_sth->execute('Primary');

    while (my ($dictyBaseid, $locusNo, $featNo) = $locus_sth->fetchrow()) {

        $locusdictyBaseid{$locusNo} = $dictyBaseid if ($locusNo);

        $featdictyBaseid{$featNo} = $dictyBaseid if ($featNo);
    }

    my $feat_sth = $dbh->prepare("

        SELECT s.dictyBaseid, s.primary_key, f.locus_no
        FROM CGM_DDB.dictyBaseid s, CGM_DDB.feature f 
        WHERE s.dictyBaseid_type = ? and 
              s.tab_name = 'FEATURE' and 
              s.primary_key = f.feature_no              
    ");

    $feat_sth->execute('Primary');

    while (my ($dictyBaseid, $featNo, $locusNo) = $feat_sth->fetchrow()) {

        $featdictyBaseid{$featNo} = $dictyBaseid if ($featNo);

        $locusdictyBaseid{$locusNo} = $dictyBaseid if ($locusNo);
    }

    my $dictyBaseidOther = $dbh->prepare("

        SELECT primary_key, dictyBaseid
        FROM CGM_DDB.dictyBaseid
        WHERE tab_name = 'FEATURE' and 
              dictyBaseid_type in ('Secondary','Deleted') 
        ORDER by dictyBaseid
    ");  

    $dictyBaseidOther->execute;

    while (my ($priKey, $dictyBaseid) = $dictyBaseidOther->fetchrow())  {

        if (my $exist = $dictyBaseidOtherFeat{$priKey})  {

            #if more than one, separate them with '|'
            $dictyBaseidOtherFeat{$priKey} = $exist."|".$dictyBaseid;           
        }
        else  {
            $dictyBaseidOtherFeat{$priKey} = $dictyBaseid if ($priKey);
        }
    }

    return;
}

#######################################################################
sub get_geneNonStandard {
#######################################################################
    
    my $sth = $dbh->prepare("
        SELECT l.locus_name 
        FROM CGM_DDB.locus l, CGM_DDB.gene_reservation g
        WHERE g.locus_no = l.locus_no AND
              g.is_standardized = 'N'
    ");
    
    $sth->execute;

    while (my $locus = $sth->fetchrow) {
	$GeneNonStandard{$locus} = 1;
    }

    return;
}

#######################################################################
sub get_external_id {
#######################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("

        SELECT e.external_id, e.source, l.locus_name, l.locus_no, e.tab_name   
        FROM CGM_DDB.external_id e, CGM_DDB.locus l 
        WHERE tab_name = 'LOCUS' and  
              e.primary_key = l.locus_no 
        UNION
        SELECT e.external_id, e.source, f.feature_name, f.feature_no, e.tab_name  
        FROM CGM_DDB.external_id e, CGM_DDB.feature f 
        WHERE tab_name = 'FEATURE' and 
              e.primary_key = f.feature_no
    ");    

    $sth->execute();

    while (my @result = $sth->fetchrow()) {

        my $dictyBaseid;

        if ($result[4] eq 'LOCUS')  {

            $dictyBaseid = $locusdictyBaseid{$result[3]}; 
            
        }
        elsif ($result[4] eq 'FEATURE')  {

            $dictyBaseid = $featdictyBaseid{$result[3]};
        }
        
        #remove non-standardized gene name
        if ( defined $GeneNonStandard{$result[2]} ) {
	    $result[2] = '';
        }

        @result = $self->cleanup(@result);

        print OUT "$result[0]\t$result[1]\t$result[2]\t$dictyBaseid\n";
    }

    close OUT;

    return;
}

###################################################################
sub get_goAnnotation  {
###################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";
    
    my $sth = $dbh->prepare("

        SELECT f.feature_name, l.locus_name, f.feature_no, g.go_aspect, g.go_term, g.goid, ge.evidence_code, l.locus_no  
        FROM CGM_DDB.feature f, CGM_DDB.locus l, CGM_DDB.go_feat_goev gfg, CGM_DDB.go g, CGM_DDB.go_evidence ge 
        WHERE f.feature_no = gfg.feature_no and 
              gfg.goid = g.goid and 
              gfg.go_evidence_no = ge.go_evidence_no and 
              l.locus_no (+) = f.locus_no 
        UNION 
        SELECT f.feature_name, l.locus_name, f.feature_no, g.go_aspect, g.go_term, g.goid, ge.evidence_code, l.locus_no  
        FROM CGM_DDB.feature f, CGM_DDB.locus l, CGM_DDB.go_locus_goev glg, CGM_DDB.go g, CGM_DDB.go_evidence ge 
        WHERE l.locus_no = glg.locus_no and 
              glg.goid = g.goid and 
              glg.go_evidence_no = ge.go_evidence_no and 
              l.locus_no = f.locus_no (+)
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {

        if ($result[0])  {
            
            #get dictyBaseid with feature_no
            $result[2] = $featdictyBaseid{$result[2]};
        }
        elsif ($result[1])   {

            #get dictyBaseid with locus_no
            $result[2] = $locusdictyBaseid{$result[7]};
        }

        #remove non-standardized gene names
	if ( defined $GeneNonStandard{$result[1]} ) {
	    $result[1] = '';
        }

	@result = $self->cleanup(@result);

        my $row = join("\t", @result[0,1,2,3,4,5,6]);

        print OUT "$row\n";
    }
    
    close OUT;

    return;
}

###################################################################
sub get_goTerm  {
###################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";
    
    my $sth = $dbh->prepare("

        SELECT goid, go_term, go_aspect, go_definition 
        FROM CGM_DDB.go 
        ORDER BY goid
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {
	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }
    
    close OUT;

    return;
}

###################################################################
sub get_phenotypes  {
###################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";
    
    my $sth = $dbh->prepare("

        SELECT f.feature_name, l.locus_name, f.feature_no, lp.phenotype_type, p.phenotype, lp.sentence, l.locus_no  
        FROM CGM_DDB.feature f, CGM_DDB.locus l, CGM_DDB.locus_pheno lp, CGM_DDB.phenotype p  
        WHERE l.locus_no = f.locus_no (+) and
              l.locus_no = lp.locus_no and
              p.phenotype_no = lp.phenotype_no 
        UNION  
        SELECT f.feature_name, l.locus_name, f.feature_no, fp.phenotype_type, p.phenotype, fp.sentence, l.locus_no   
        FROM CGM_DDB.feature f, CGM_DDB.locus l,CGM_DDB.feat_pheno fp,CGM_DDB.phenotype p  
        WHERE l.locus_no (+) = f.locus_no and 
              f.feature_no = fp.feature_no and
              p.phenotype_no = fp.phenotype_no
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {
        
	@result = $self->cleanup(@result);
        if ($result[0])  {
            
            #get dictyBaseid with feature_no
            $result[2] = $featdictyBaseid{$result[2]};
        }
        elsif ($result[1])   {

            #get dictyBaseid with locus_no
            $result[2] = $locusdictyBaseid{$result[6]};
        }

        #remove non-standardized gene names
	if ( defined $GeneNonStandard{$result[1]} ) {
	    $result[1] = '';
        }

	@result = $self->cleanup(@result);

        my $row = join("\t", @result[0,1,2,3,4,5]);

        print OUT "$row\n";
    }
    
    close OUT;

    return;
}

###################################################################
sub get_geneticMap  {
###################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";
    
    my $sth = $dbh->prepare("

        SELECT tp.two_point_name, tp.parental_ditype, tp.nonparental_ditype, tp.tetratype, tp.first_division, tp.second_division, tp.distance, tp.std_err, tp.interference, tp.interference_std_err, tp.remark, l1.locus_name, f1.feature_name, l1.chromosome, l1.genetic_position, l1.locus_no, l2.locus_name, f2.feature_name, l2.chromosome, l2.genetic_position, l2.locus_no, f1.feature_no, f2.feature_no  
        FROM CGM_DDB.two_point tp, CGM_DDB.locus_twopt lt1, CGM_DDB.feature f1, 
          CGM_DDB.locus l1, CGM_DDB.locus_twopt lt2, CGM_DDB.feature f2, CGM_DDB.locus l2  
        WHERE tp.two_point_no = lt1.two_point_no and 
              lt1.locus_no = l1.locus_no and 
              l1.locus_no = f1.locus_no (+) and
              tp.two_point_no = lt2.two_point_no and
              lt2.locus_no = l2.locus_no and 
              l2.locus_no = f2.locus_no (+) and 
              l1.locus_no < l2.locus_no  
        ORDER BY tp.two_point_name 
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {

        if ($result[21])   {

            #get dictyBaseid with feature_no
            $result[15] = $featdictyBaseid{$result[21]};
        }
        elsif ($result[15])  {
            
            #get dictyBaseid with locus_no
            $result[15] = $locusdictyBaseid{$result[15]};
        }

        if ($result[22])  {

            $result[20] = $featdictyBaseid{$result[22]};
        }
        elsif ($result[20])  {

            $result[20] = $locusdictyBaseid{$result[20]};
        }

        #truncate the array
        $#result = 20;

        #remove non-standardized gene name
        if ( defined $GeneNonStandard{$result[11]} ) {
	    $result[11] = '';
        }
        if ( defined $GeneNonStandard{$result[16]} ) {
	    $result[16] = '';
        }

	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }
    
    close OUT;

    return;
}

###################################################################
sub get_chromFeature  {
###################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("        
        SELECT distinct f.feature_name, l.locus_name, f.feature_no, ft.feature_type, f.chromosome, f.start_coord, f.stop_coord, f.strand, f.feature_no, f.feature_no, f.brief_id, f.date_created, f.feature_version, l.genetic_position, l.enzyme, g.reservation_date, g.is_standardized   
        FROM CGM_DDB.feature f, CGM_DDB.feature_type ft, CGM_DDB.locus l, CGM_DDB.gene_reservation g 
        WHERE f.feature_no = ft.feature_no  and 
              ft.feature_no not in (SELECT fe.feature_no FROM CGM_DDB.feature fe, CGM_DDB.feature_type fty WHERE fe.feature_no = fty.feature_no and fty.feature_type in ('Deleted', 'Merged')) and
              f.locus_no = l.locus_no (+) and 
              l.locus_no = g.locus_no (+) 
    ");			    
   
    $sth->execute;

    my $arrayRef = $sth->fetchall_arrayref();

    #get aliases
    my $alias_sth = $dbh->prepare("

        SELECT f.feature_no, a.alias_name
        FROM CGM_DDB.feature f, CGM_DDB.locus l, CGM_DDB.locus_alias la, CGM_DDB.alias a 
        WHERE f.locus_no = l.locus_no and
              l.locus_no = la.locus_no (+) and 
              la.alias_no  = a.alias_no (+)
        UNION
        SELECT f.feature_no, a.alias_name
        FROM CGM_DDB.feature f, CGM_DDB.alias a
        WHERE f.feature_no = a.feature_no 
    ");

    $alias_sth->execute;

    my %aliasFeat;

    while (my ($featNo, $alias) = $alias_sth->fetchrow())  {

        next if (!$alias);

        if (!$aliasFeat{$featNo})  {
            $aliasFeat{$featNo} = $alias
        }
        else {
            $aliasFeat{$featNo} = $aliasFeat{$featNo}.'|'.$alias;
        }
    }

    #for description field, get gene_product, then get description in locus table. If none of the  two above, use brief_id in feature table.
    my %description;
    my %geneProduct;
        
    my $gp_sth = $dbh->prepare("
        SELECT l.locus_name, gp.gene_product
        FROM CGM_DDB.locus l, CGM_DDB.gene_product gp, CGM_DDB.locus_gp lg
        WHERE l.locus_no = lg.locus_no AND
              lg.gene_product_no = gp.gene_product_no AND
              gp.gene_product is not null
    ");

    $gp_sth->execute;

    while ( my ($lnm, $gp) = $gp_sth->fetchrow ) {
	$geneProduct{$lnm} = $gp;
    }
    $gp_sth->finish;

    my $desc_sth = $dbh->prepare("
        SELECT locus_name, description 
        FROM CGM_DDB.locus 
        WHERE description is not null
    ");

    $desc_sth->execute;

    while ( my ($lnm, $des) =  $desc_sth->fetchrow ) {
	$description{$lnm} = $des;
    }
    $desc_sth->finish;

    my $formatAref = $self->execute_format($arrayRef);

    foreach my $lineAref (@$formatAref)   {

        my $featNo = $$lineAref[2];
        my $locusName = $$lineAref[1];

        if ($locusName) {
	    if ( $geneProduct{$locusName} ) {
		$$lineAref[10] = $geneProduct{$locusName};
            }
            elsif ( $description{$locusName} ) {
		$$lineAref[10] = $description{$locusName};
	    }
        }

        #get aliases
        $$lineAref[2] = $aliasFeat{$featNo};

        #get primarydictyBaseid
        $$lineAref[8] = $featdictyBaseid{$featNo};
        
        #get dictyBaseidOther
        $$lineAref[9] = $dictyBaseidOtherFeat{$featNo};

        #remove non-standard gene names
        if (defined $GeneNonStandard{$locusName} ) {
	    $$lineAref[1] = '';
        }

        my @result = @$lineAref;
	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }

    close OUT;

    return;
}


###################################################################
sub get_annotationChange  {
###################################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("        

        SELECT h.feature_name, ft.feature_type, h.chromosome, h.start_coord, h.stop_coord, h.strand, h.feature_no, h.feature_no, f.feature_name, f.feature_no, h.brief_id, c.note, h.date_created   
        FROM CGM_DDB.feature f, CGM_DDB.feature_type ft, CGM_DDB.alias a, CGM_DDB.feat_cn fc, CGM_DDB.curator_note c, 
(SELECT fe.feature_no, fe.feature_name, fe.chromosome, fe.start_coord, fe.stop_coord, fe.strand, fe.brief_id, fty.date_created  
FROM CGM_DDB.feature fe, CGM_DDB.feature_type fty 
WHERE fe.feature_no = fty.feature_no and fty.feature_type in ('Deleted', 'Merged')) h   
        WHERE h.feature_no = ft.feature_no (+) and 
              h.feature_name = a.alias_name (+) and 
              a.feature_no = f.feature_no (+) and  
              h.feature_no = fc.feature_no (+) and  
              fc.curator_note_no = c.curator_note_no (+) 
        ORDER BY h.feature_name 
    ");
	
    $sth->execute;

    my $arrayRef = $sth->fetchall_arrayref();

    my $formatAref = $self->execute_format($arrayRef);

    foreach my $lineAref (@$formatAref)   {

        #get primary dictyBaseid and dictyBaseidOther of changed feature
        $$lineAref[6] = $featdictyBaseid{$$lineAref[6]};

        $$lineAref[7] = $dictyBaseidOtherFeat{$$lineAref[7]};
        
        #get primary dictyBaseid of current feature
        $$lineAref[9] = $featdictyBaseid{$$lineAref[9]};

        my @result = @$lineAref;
	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }

    close OUT;
    
    return;
}

############################################################
sub get_clone  {
############################################################
    
    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare(" 
        SELECT atcc_name, washu_name, chromosome, start_coord, stop_coord  
        FROM CGM_DDB.clone 
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {

	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }

    close OUT;

    return;
}

############################################################
sub get_sage  {
############################################################
    
    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare(" 
        SELECT stl.tag_seq, stl.chromosome, stl.class, stl.start_coord, stl.strand, f.feature_name, st.genome_hits, se.condition, se.exp_value  
        FROM CGM_DDB.sage_tag_location stl, CGM_DDB.feature f, CGM_DDB.sage_tag st, CGM_DDB.sage_expression se  
        WHERE stl.feature_no = f.feature_no (+) and
              stl.tag_seq = st.tag_seq and
              st.tag_seq = se.tag_seq
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {

	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }

    close OUT;

    return;
}

############################################################
sub get_geneReference  {
############################################################
    
    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare(" 
        SELECT r.pubmed, r.citation, l.locus_name, f.feature_name, lg.literature_topic, f.feature_no, l.locus_no   
        FROM CGM_DDB.reference r, CGM_DDB.locus_gene_info lg, CGM_DDB.locus l, CGM_DDB.feature f  
        WHERE r.reference_no = lg.reference_no and 
              lg.locus_no = l.locus_no and  
              l.locus_no = f.locus_no (+)  
        UNION  
        SELECT r.pubmed, r.citation, l.locus_name, f.feature_name, fg.literature_topic, f.feature_no, l.locus_no  
        FROM CGM_DDB.reference r, CGM_DDB.feat_gene_info fg, CGM_DDB.locus l, CGM_DDB.feature f  
        WHERE r.reference_no = fg.reference_no and 
              fg.feature_no = f.feature_no and  
              l.locus_no (+) = f.locus_no  
    ");

    $sth->execute;

    my $arrayRef = $sth->fetchall_arrayref();

    #get primary dictyBaseids    
    foreach my $lineAref (@$arrayRef)   {

        if ($$lineAref[5]) {
            $$lineAref[5] =  $featdictyBaseid{$$lineAref[5]};
        }
        elsif ($$lineAref[6]) {
            $$lineAref[5] =  $locusdictyBaseid{$$lineAref[6]};
        }
    }

    my $compare = '1,5';
    my $multi = '4';

    my $formatAref = $self->execute_advance_format($arrayRef, $compare, $multi);

    foreach my $aref (@$formatAref)  {

	my @result = @$aref[0 .. 5];

        #remove non-standardized gene names
	if ( defined $GeneNonStandard{$result[2]} ) {
	    $result[2] = '';
        }

        @result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }

    close OUT;

    return;

}

############################################################
sub get_locus  {
############################################################
    
    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("
        SELECT distinct l.locus_name, l.chromosome, l.genetic_position, l.locus_no, l.enzyme,l.description, f.feature_name, r.reservation_date, r.is_standardized, f.feature_no    
        FROM CGM_DDB.locus l, CGM_DDB.feature f, CGM_DDB.gene_reservation r  
        WHERE l.locus_no = f.locus_no (+) and l.locus_no = r.locus_no(+) 
        ORDER BY l.locus_name 
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {

        #get primary dictyBaseid
        if ($result[9]) {
            $result[3] = $featdictyBaseid{$result[9]};
        }
        else {
            $result[3] = $locusdictyBaseid{$result[3]};
        }

	@result = $self->cleanup(@result);

        my $row = join("\t", @result[0 .. 8]);

        print OUT "$row\n";
    }

    close OUT;

    return;

} 

############################################################
sub get_IntronExon  {
############################################################

    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("
        SELECT f.feature_name, l.locus_name, f.feature_no, 
               ft.feature_type, f.chromosome, f.start_coord,
               f.stop_coord, st.subfeature_type, s.start_coord,
               s.stop_coord
        FROM CGM_DDB.feature f, CGM_DDB.locus l, CGM_DDB.subfeature s,
             CGM_DDB.subfeature_type st, CGM_DDB.feature_type ft
        WHERE f.feature_no = ft.feature_no and 
              f.feature_no = s.feature_no and
              s.subfeature_no = st.subfeature_no and  
              f.locus_no = l.locus_no (+)
        ORDER BY f.feature_name, st.subfeature_type, s.start_coord,s.stop_coord
    ");

    $sth->execute;

    my $arrayRef = $sth->fetchall_arrayref();

    my $compare = '0,7,8,9';
    my $multi = '3';

    my $formatAref = $self->execute_advance_format($arrayRef, $compare, $multi);

    foreach my $lineAref (@$formatAref)   {

        my $featNo = $$lineAref[2];

        #get primarydictyBaseid
        $$lineAref[2] = $featdictyBaseid{$featNo};

        #remove non-standardized gene name
        if (defined $GeneNonStandard{$$lineAref[1]} ) {
	    $$lineAref[1] = '';
        }

	my @result = @$lineAref;
	@result = $self->cleanup(@result);

        my $row = join("\t", @result);

        print OUT "$row\n";
    }

    close OUT;

    return;    
}

############################################################
sub get_ProteinInfo {
############################################################
    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("
        SELECT p.*, f.feature_name
        FROM CGM_DDB.protein_info p, CGM_DDB.feature f
        WHERE f.feature_no = p.feature_no
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {

        my $dictyBaseid;

        #get primary dictyBaseid
        if ($result[1]) {
            $dictyBaseid = $featdictyBaseid{$result[1]};
        }
        
        my @row = ($result[34], $dictyBaseid, @result[4 .. 10], @result[14 .. 33],
                   @result[11,12,13]) ;

	@row = $self->cleanup(@row);
       
        my $row = join("\t", @row);

        print OUT "$row\n";
    }

    close OUT;

    return; 
}

############################################################
sub get_ChromosomeLength {
############################################################
    my ($self, $file) = @_;

    open (OUT, ">$file") || die "Can't open $file:$!\n";

    my $sth = $dbh->prepare("
        SELECT c.chromosome, e.external_id, c.physical_length
        FROM CGM_DDB.chromosome c, CGM_DDB.external_id e
        WHERE e.tab_name = 'CHROMOSOME' and
              e.primary_key = c.chromosome
    ");

    $sth->execute;

    while (my @result = $sth->fetchrow())  {
	@result = $self->cleanup(@result);
	my $row = join("\t", @result);
        print OUT "$row\n";
    }

    close OUT;
    return;
}

############################################################
sub execute_format {
############################################################

    my ($self, $arrayRef) = @_;

    my @formatArray;

    my $row;   #hold the array of the first/previous result line

    foreach my $current (@$arrayRef) {

        if (!$row)  {
            $row = $current;
            next;
        }

        if ($$row[0] eq $$current[0]){

            for (my $i=1; $i<(scalar @$row); $i++) {

                if (($$row[$i] ne $$current[$i]) && ($$row[$i]) && ($$current[$i]))                {

                    #for duplication check
                    my $curr = $$current[$i];

                    if (!($$row[$i] =~ /$curr/)) {  
 
                        $$row[$i] = "$$row[$i]|$$current[$i]";
                    }
                }    
                elsif (!$$row[$i]) {

                    $$row[$i] = $$current[$i];    
                } 
           }
       }

       else {

           push (@formatArray, $row);

           $row = $current;
       }
    }

    push (@formatArray, $row);

    return \@formatArray;
} 

############################################################
sub execute_advance_format {
############################################################
    my ($self, $arrayRef, $compare, $multi) = @_;

    my @Compare;
    my @Multi;

    my @formatArray;

    if ($compare =~ /,/) {
        @Compare = split (',', $compare);
    }
    else {
        $Compare[0] = $compare;
    }

    if ($multi =~ /,/) {
        @Multi = split (',', $multi);
    }
    else {
        $Multi[0] = $multi;
    }

    my $row; #hold the array of the first/previous result line

    foreach my $current (@$arrayRef) {

        if (!$row)  {
            $row = $current;
            next;
        }

        my $check = 1;

        for (my $z=0; $z<@Compare; $z++)  {
	    my $column = $Compare[$z];
         
            if ( $$row[$column] ne $$current[$column] ) {

	        $check = 0;

                last;
            }
        }   

        if ($check == 1)  {

            for (my $j=0; $j<@Multi; $j++) {

	        my $index = $Multi[$j];

                if ( ($$row[$index] ne $$current[$index]) && ($$row[$index]) && ($$current[$index]) ) {

	            my $curr = $$current[$index];

                    if (!($$row[$index] =~ /$curr/)) {   
                        $$row[$index] = $$row[$index].'|'.$$current[$index];
	            }
                }
                elsif (!$$row[$index]) {
                    $$row[$index] = $$current[$index];    
                } 
             } 
         }

         else {
             push (@formatArray, $row);

             $row = $current;
         }
   }

    #the last unique row in the result set
    push (@formatArray, $row);

    return \@formatArray;
}

#get rid of extra new line and html tags
############################################################
sub cleanup {
############################################################
    my ($self, @array) = @_;
    for (my $i = 0; $i<@array; $i++) {

        #get rid of embedded html tags and unwanted white spaces.
	$array[$i] =~ s/<[^<>]+>//g;
	DeleteUnwantedChar(\$array[$i]);
    }
    return @array;
}


#give the compiler sth to run.
####################################################################
1;
####################################################################

=pod

=head1 Name

 /usr/local/dicty/www_dictybase/db/lib/dictyBase/Objects/Datadump.pm

=head1 Description

 This package contains all the necessary methods to generate dictyBase ftp 
 data dump files at /share/ftp/Dictyostelium/data_dump/. Each method is responsible 
 for generating one data_dump file, whose full path and file name are passed 
 in as a parameter. All files generated are tab_delimited. For details about 
 column names etc, check the README files under /share/ftp/Dictyostelium/data_dump/
 and its subdirectories.

=head1 Scripts using this package

 /share/dictyBase/bin/ftp_datadump.pl

=head1 Instantiating a New Reference Object

 my $obj = Datadump->new;

=head1 Accessor Methods

 Each accessor method takes the full path and file name as parameter and 
 generates a tab-delimited data dump file.

 Example: my $file = '/share/ftp/Dictyostelium/data_dump/external_id/external_id.tab';

=head2 get_external_id method

 Usage:    $obj->get_external_id($file);

=head2 get_goTerm method

 Usage:    $obj->get_goTerm($file);

=head2 get_goAnnotation method

 Usage:    $obj->get_goAnnotation($file);

=head2 get_phenotypes method

 Usage:    $obj->get_phenotypes($file);

=head2 get_geneticMap method

 Usage:    $obj->get_geneticMap($file);

=head2 get_chromFeature method

 Usage:    $obj->get_chromFeature($file);

=head2 get_annotationChange method

 Usage:    $obj->get_annotationChange($file);

=head2 get_clone method

 Usage:    $obj->get_clone($file);

=head2 get_sage method

 Usage:    $obj->get_sage($file);

=head2 get_geneReference method

 Usage:    $obj->get_geneReference($file);

=head2 get_locus method

 Usage:    $obj->get_locus($file);

=head1 Functions

 They are invoked by different class methods for data preparing, formatting etc.

=head2 get_dictyBaseid

 This function is executed when a new object gets initiated. It retrieves all
 available primary and non-primary dictyBaseids and stores them in hashes with  
 either locus_no or feature_no as hash key. These hashes are available to all 
 accessor methods to get corresponding dictyBaseid or dictyBaseid_other(s) with either a 
 locus_no or a feature_no.

=head2 execute_format

 This function accepts an array_reference as parameter and de-reference it. 
 Each element in this array is another array_reference referring to one row of 
 the specific data dump file. Each row is compared to its previous row. If they
 represent the same locus or feature, this two rows will be merged together.
 If the data of any two corresponding columns in these two rows are different,
 they will be concatenated and separated by '|'.

=head2 execute_advance_format

 This function works almost the same way as execute_format, except that this 
 function accepts another two parameters specifying which column to compare
 and which column to concatenate. For any two consecutive rows under 
 comparison, only the columns specified for comparison are compared. If they 
 are identical, the columns specified for concatenation get merged with '|' as 
 separator.

=head1 Author

Stan Dong

qdong@genome.stanford.edu

=cut
