Source code

001/**
002 * Copyright (C) 2010-2015 The Roslin Institute <contact andy.law@roslin.ed.ac.uk>
003 *
004 * This file is part of JEnsembl: a Java API to Ensembl data sources developed by the
005 * Bioinformatics Group at The Roslin Institute, The Royal (Dick) School of
006 * Veterinary Studies, University of Edinburgh.
007 *
008 * Project hosted at: http://jensembl.sourceforge.net
009 *
010 * This is free software: you can redistribute it and/or modify
011 * it under the terms of the GNU General Public License (version 3) as published by
012 * the Free Software Foundation.
013 *
014 * This software is distributed in the hope that it will be useful,
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017 * GNU General Public License for more details.
018 *
019 * You should have received a copy of the GNU General Public License
020 * in this software distribution. If not, see: http://opensource.org/licenses/gpl-3.0.html
021 */
022package uk.ac.roslin.ensembl.demo;
023
024import java.io.BufferedReader;
025import java.io.File;
026import java.io.FileReader;
027import java.util.StringTokenizer;
028import uk.ac.roslin.ensembl.config.DBConnection.DataSource;
029import uk.ac.roslin.ensembl.dao.database.DBRegistry;
030import uk.ac.roslin.ensembl.model.core.Chromosome;
031import uk.ac.roslin.ensembl.model.core.Species;
032import uk.ac.roslin.ensembl.model.database.Registry;
033
034/*
035 * Christelle's request: see rt 5216
036 * 20120208
037* >As discussed today, I would like to use the Ensembl Java API to access genomic data 
038* >from the human hg19 genome (for now). The human hg19 assembly version is GRCh37.p5 
039* >(Feb 2009) and the Ensembl database version is 65.37.
040* >I would like to extract DNA sequences around single genomic locations, 
041* >say [+200, *single position*, -100], single genomic positions are summarised 
042* >in .bed files. Please find attached an example BED file for test purposes 
043* >with the following format for the 5 columns separated by tabulation (\t): 
044* >Chromosome name\tstart\tend\t*single position*\tstrand. 
045* >The columns of interest to me here are column 1, 4 and 5 ("+"  referring to 
046* >forward and "-" to reverse strand).
047*
048* When developing this it became clear that I had to implement retrieval of PAR regions on the sex chromosomes.
049* see checkin SVN 247 
050*/
051
052
053public class UserChristelle {
054    
055    
056    //Script developed for user: parses chromosomal regions (gene locations) 
057    //specified in a local BED data file, uses the JEnsembl API to retrieve 
058    //sequence data flanking the given locations.
059    //fetching sequences from 200 5' to 100 3' of regions given in a BED file
060    public static void main(String[] args) throws Exception {
061
062
063       String fname = "src/main/resources/example_human_hg19_dataset.bed";
064
065       File f = new File(fname);
066
067       if (!f.canRead()) {
068            System.out.println("cant read file");
069            System.exit(-1);
070        }
071       
072        Registry eReg = DBRegistry.createRegistryForDataSource(DataSource.ENSEMBLDB);
073        Species sp = eReg.getSpeciesByAlias("human");
074
075        try {
076            BufferedReader reader = new BufferedReader(new FileReader(f));
077
078            String line;
079
080            //loop through all the lines till we return null at the end
081            while ((line = reader.readLine()) != null) {
082
083
084                //skip over empty lines
085                if (line.isEmpty() || line.startsWith("--")) {
086                    continue;
087                }
088
089                StringTokenizer tokens = new StringTokenizer(line);
090                
091                if (tokens.countTokens()!=5) {
092                    continue;
093                }
094                
095                
096                String chrName = tokens.nextToken().replace("chr", "");
097                
098                if (chrName.equalsIgnoreCase("Y")) {
099                    System.out.println("");
100                }
101                
102                tokens.nextToken();
103                tokens.nextToken();
104                String position = tokens.nextToken();
105                
106                int location = Integer.parseInt(position);
107                String strand = tokens.nextToken();
108                
109                //release 76 changes assembly used...
110                //so 
111                Chromosome chr = sp.getChromosomeByName(chrName, "75");
112                
113                if (strand.equalsIgnoreCase("+")) {
114                    System.out.println("Chromosome "+chrName+": "+location+" "+strand);
115                    System.out.println(chr.getSequenceAsString(location-100, location+200));
116                    System.out.println("");
117                } else if (strand.equalsIgnoreCase("-")) {
118                    System.out.println("Chromosome "+chrName+": "+location+" "+strand);
119                    System.out.println(chr.getReverseComplementSequenceAsString(location-200, location+100));
120                    System.out.println("");
121                }
122
123            }
124        } catch (Exception e) {
125            System.out.println("cant parse file");
126            System.exit(-1);
127        }
128
129        
130        System.out.println("\n\n*****************************\n* COMPLETED FUNCTIONAL TEST *\n*****************************\n");
131
132    }
133
134
135}