001/** 002 * Copyright (C) 2010-2015 The Roslin Institute <contact andy.law@roslin.ed.ac.uk> 003 * 004 * This file is part of JEnsembl: a Java API to Ensembl data sources developed by the 005 * Bioinformatics Group at The Roslin Institute, The Royal (Dick) School of 006 * Veterinary Studies, University of Edinburgh. 007 * 008 * Project hosted at: http://jensembl.sourceforge.net 009 * 010 * This is free software: you can redistribute it and/or modify 011 * it under the terms of the GNU General Public License (version 3) as published by 012 * the Free Software Foundation. 013 * 014 * This software is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 017 * GNU General Public License for more details. 018 * 019 * You should have received a copy of the GNU General Public License 020 * in this software distribution. If not, see: http://opensource.org/licenses/gpl-3.0.html 021 */ 022package uk.ac.roslin.ensembl.demo; 023 024import java.io.BufferedReader; 025import java.io.File; 026import java.io.FileReader; 027import java.util.StringTokenizer; 028import uk.ac.roslin.ensembl.config.DBConnection.DataSource; 029import uk.ac.roslin.ensembl.dao.database.DBRegistry; 030import uk.ac.roslin.ensembl.model.core.Chromosome; 031import uk.ac.roslin.ensembl.model.core.Species; 032import uk.ac.roslin.ensembl.model.database.Registry; 033 034/* 035 * Christelles's request: see rt 5216 036 * 20120208 037* >As discussed today, I would like to use the Ensembl Java API to access genomic data 038* >from the human hg19 genome (for now). The human hg19 assembly version is GRCh37.p5 039* >(Feb 2009) and the Ensembl database version is 65.37. 040* >I would like to extract DNA sequences around single genomic locations, 041* >say [+200, *single position*, -100], single genomic positions are summarised 042* >in .bed files. Please find attached an example BED file for test purposes 043* >with the following format for the 5 columns separated by tabulation (\t): 044* >Chromosome name\tstart\tend\t*single position*\tstrand. 045* >The columns of interest to me here are column 1, 4 and 5 ("+" referring to 046* >forward and "-" to reverse strand). 047* 048* When developing this it became clear i had to implement retrieval of PAR regions on sex chromosome. 049* see checkin SVN 247 050*/ 051 052 053public class UserChristelle { 054 055 056 //Script developed for user: parses chromosomal regions (gene locations) 057 //specified in a local BED data file, uses the JEnsembl API to retrieve 058 //sequence data flanking the given locations. 059 //fetching sequences from 200 5' to 100 3' of regions given in a BED file 060 public static void main(String[] args) throws Exception { 061 062 063 String fname = "src/main/resources/example_human_hg19_dataset.bed"; 064 065 File f = new File(fname); 066 067 if (!f.canRead()) { 068 System.out.println("cant read file"); 069 System.exit(-1); 070 } 071 072 Registry eReg = DBRegistry.createRegistryForDataSource(DataSource.ENSEMBLDB); 073 Species sp = eReg.getSpeciesByAlias("human"); 074 075 try { 076 BufferedReader reader = new BufferedReader(new FileReader(f)); 077 078 String line; 079 080 //loop through all the lines till we return null at the end 081 while ((line = reader.readLine()) != null) { 082 083 084 //skip over empty lines 085 if (line.isEmpty() || line.startsWith("--")) { 086 continue; 087 } 088 089 StringTokenizer tokens = new StringTokenizer(line); 090 091 if (tokens.countTokens()!=5) { 092 continue; 093 } 094 095 096 String chrName = tokens.nextToken().replace("chr", ""); 097 098 if (chrName.equalsIgnoreCase("Y")) { 099 System.out.println(""); 100 } 101 102 tokens.nextToken(); 103 tokens.nextToken(); 104 String position = tokens.nextToken(); 105 106 int location = Integer.parseInt(position); 107 String strand = tokens.nextToken(); 108 109 //release 76 changes assembly used... 110 //so 111 Chromosome chr = sp.getChromosomeByName(chrName, "75"); 112 113 if (strand.equalsIgnoreCase("+")) { 114 System.out.println("Chromosome "+chrName+": "+location+" "+strand); 115 System.out.println(chr.getSequenceAsString(location-100, location+200)); 116 System.out.println(""); 117 } else if (strand.equalsIgnoreCase("-")) { 118 System.out.println("Chromosome "+chrName+": "+location+" "+strand); 119 System.out.println(chr.getReverseComplementSequenceAsString(location-200, location+100)); 120 System.out.println(""); 121 } 122 123 } 124 } catch (Exception e) { 125 System.out.println("cant parse file"); 126 System.exit(-1); 127 } 128 129 130 System.out.println("\n\n*****************************\n* COMPLETED FUNCTIONAL TEST *\n*****************************\n"); 131 132 } 133 134 135}