001 /*
002 * CharSequence.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 import java.io.Reader;
035 import java.io.BufferedReader;
036 import java.io.IOException;
037
038 /**
039 * This class implements a sequence of characters stored as an array that provides random
040 * access to any position in constant time.
041 *
042 * <P>The input can come from any source, provided it is encapsulated in a proper
043 * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next
044 * <CODE>read</CODE> operation must return the first character of the sequence) and it is
045 * not closed when its end is reached, so the client is allowed to reset it and maybe use
046 * it for another purpose.</P>
047 *
048 * <P>Sequences can contain letters only although lines started with the
049 * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely
050 * skipped. White spaces (including tabs, line feeds and carriage returns) are also
051 * ignored throughout.</P>
052 *
053 * <P>This class is used by two sequence alignment algorithms: {@linkplain SmithWaterman}
054 * and {@linkplain NeedlemanWunsch}.</P>
055 *
056 * @author Sergio A. de Carvalho Jr.
057 * @see SmithWaterman
058 * @see NeedlemanWunsch
059 */
060 public class CharSequence
061 {
062 /**
063 * The character used to start a comment line in a sequence file. When this character
064 * is found, the rest of the line is ignored.
065 */
066 protected static final char COMMENT_CHAR = '>';
067
068 /**
069 * Stores the sequence as an array of characters.
070 */
071 protected char sequence[];
072
073 /**
074 * Creates a new instance of a <CODE>CharSequence</CODE>, loading the sequence data
075 * from the <CODE>Reader</CODE> input stream.
076 *
077 * @param reader source of characters for this sequence
078 * @throws IOException if an I/O exception occurs when reading the input
079 * @throws InvalidSequenceException if the input does not contain a valid sequence
080 */
081 public CharSequence (Reader reader) throws IOException, InvalidSequenceException
082 {
083 int ch;
084 char c;
085
086 BufferedReader input = new BufferedReader(reader);
087
088 StringBuffer buf = new StringBuffer();
089
090 // read characters
091 while ((ch = input.read()) != -1)
092 {
093 // conver to char
094 c = (char) ch;
095
096 // skip line if comment character is found
097 if (c == COMMENT_CHAR)
098 input.readLine();
099
100 // accept letters only
101 else if (Character.isLetter(c))
102 buf.append(c);
103
104 // anything else, except whitespaces, will throw an exception
105 else if (!Character.isWhitespace(c))
106 throw new InvalidSequenceException
107 ("Sequences can contain letters only.");
108 }
109
110 // check if read anything!
111 if (buf.length() > 0)
112 sequence = new char[buf.length()];
113 else
114 throw new InvalidSequenceException ("Empty sequence.");
115
116 // copy data to
117 buf.getChars(0, buf.length(), sequence, 0);
118 }
119
120 /**
121 * Returns the number of characters of this sequence.
122 *
123 * @return int number of characters of this sequence
124 */
125 public int length ()
126 {
127 return sequence.length;
128 }
129
130 /**
131 * Returns the character at a given position. For the client, the first character is
132 * at position 1, while the last character is at position <CODE>length()</CODE>. This
133 * is convinient for sequence alignment algorithms based on a classic dynamic
134 * programming matrix since the sequences usually start at row/column 1. This method
135 * does not check boundaries, therefore an <CODE>ArrayIndexOutOfBoundsException</CODE>
136 * may be raised if <CODE>pos</CODE> is out of bounds.
137 *
138 * @param pos position of character (from 1 to <CODE>length()</CODE> inclusive)
139 * @return the character
140 */
141 public char charAt (int pos)
142 {
143 // convert from one-based to zero-based index
144 return sequence[pos-1];
145 }
146
147 /**
148 * Returns a string representation of the sequence.
149 *
150 * @return a string representation of the sequence
151 */
152 public String toString ()
153 {
154 return new String(sequence);
155 }
156 }