// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements source, a buffered rune reader
// which is specialized for the needs of the Go scanner:
// Contiguous sequences of runes (literals) are extracted
// directly as []byte without the need to re-encode the
// runes in UTF-8 (as would be necessary with bufio.Reader).
//
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.

package syntax

import (
	"io"
	"unicode/utf8"
)

// starting points for line and column numbers
const linebase = 1
const colbase = 1

// buf [...read...|...|...unread...|s|...free...]
//         ^      ^   ^            ^
//         |      |   |            |
//        suf     r0  r            w

// source is a buffered rune reader over an io.Reader.
//
// It tracks the current and the previous read position (buffer index,
// line, and column) so that the most recently read rune can be unread,
// and it can collect the bytes of a literal across buffer refills.
// The byte at buf[w] always holds the sentinel utf8.RuneSelf.
type source struct {
	src  io.Reader
	errh func(line, pos uint, msg string)

	// source buffer
	buf         [4 << 10]byte
	offs        int   // source offset of buf
	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
	line0, line uint  // previous/current line
	col0, col   uint  // previous/current column (byte offsets from line start)
	ioerr       error // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int    // literal suffix; suf >= 0 means we are scanning a literal
}

// init initializes source to read from src and to report errors via errh.
// errh must not be nil.
//
// init resets all positions, clears any pending I/O error and any
// literal in progress, and keeps the capacity of the literal buffer
// for reuse.
func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
	s.src = src
	s.errh = errh

	s.buf[0] = utf8.RuneSelf // terminate with sentinel
	s.offs = 0
	s.r0, s.r, s.w = 0, 0, 0
	s.line0, s.line = 0, linebase
	s.col0, s.col = 0, colbase
	s.ioerr = nil

	s.lit = s.lit[:0]
	s.suf = -1
}

// ungetr ungets the most recently read rune by restoring the read
// position, line, and column that were saved when getr started.
func (s *source) ungetr() {
	s.r, s.line, s.col = s.r0, s.line0, s.col0
}

// ungetr2 is like ungetr but enables a 2nd ungetr.
// It must not be called if one of the runes seen
// was a newline.
//
// After undoing the last read, the saved previous position is moved
// back by one byte and one column, so a following ungetr steps back
// exactly one more byte on the same line.
func (s *source) ungetr2() {
	s.ungetr()
	// line must not have changed
	s.r0--
	s.col0--
}

// error reports msg to the installed error handler at the saved
// previous position (line0, col0).
func (s *source) error(msg string) {
	s.errh(s.line0, s.col0, msg)
}

// getr reads and returns the next rune.
// It returns -1 once no more input is available.
//
// If a read or source encoding error occurs, getr
// calls the error handler installed with init.
// The handler must exist.
//
// NUL characters and invalid UTF-8 bytes are reported and skipped.
// A BOM is skipped silently at the very start of the source and
// reported (and skipped) anywhere else. A pending I/O error other
// than io.EOF is reported each time the end of input is reached.
//
// The (line, col) position passed to the error handler
// is always at the current source reading position.
func (s *source) getr() rune {
redo:
	// Remember where this rune starts so that ungetr can restore it;
	// errors below are reported at this position.
	s.r0, s.line0, s.col0 = s.r, s.line, s.col

	// We could avoid at least one test that is always taken in the
	// for loop below by duplicating the common case code (ASCII)
	// here since we always have at least the sentinel (utf8.RuneSelf)
	// in the buffer. Measure and optimize if necessary.

	// make sure we have at least one rune in buffer, or we are at EOF
	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
	}

	// common case: ASCII and enough bytes
	// (invariant: s.buf[s.w] == utf8.RuneSelf)
	if b := s.buf[s.r]; b < utf8.RuneSelf {
		s.r++
		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
		// remember the line offset instead and then compute the offset as needed
		// (which is less often).
		s.col++
		if b == 0 {
			s.error("invalid NUL character")
			goto redo
		}
		if b == '\n' {
			s.line++
			s.col = colbase
		}
		return rune(b)
	}

	// EOF
	// (The loop above only stops with an empty unread region once
	// fill has recorded an error, so s.ioerr is set here.)
	if s.r == s.w {
		if s.ioerr != io.EOF {
			s.error(s.ioerr.Error())
		}
		return -1
	}

	// uncommon case: not ASCII
	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
	s.r += w
	s.col += uint(w)

	if r == utf8.RuneError && w == 1 {
		s.error("invalid UTF-8 encoding")
		goto redo
	}

	// BOM's are only allowed as the first character in a file
	const BOM = 0xfeff
	if r == BOM {
		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
			s.error("invalid BOM in the middle of the file")
		}
		goto redo
	}

	return r
}

// fill reads more source bytes into the buffer.
//
// Before reading, it moves the unread bytes towards the start of the
// buffer to make room, saving the part of a pending literal that is
// about to be discarded. It then calls the underlying reader up to
// 100 times until data or an error arrives, stores the sentinel after
// the last byte read, and records any reader error in s.ioerr. If no
// attempt makes progress, s.ioerr is set to io.ErrNoProgress.
func (s *source) fill() {
	// Slide unread bytes to beginning but preserve last read char
	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
	// call, only for ".." character sequence and float literals
	// starting with ".").
	if s.r0 > 1 {
		// save literal prefix, if any
		// (We see at most one ungetr call while reading
		// a literal, so make sure s.r0 remains in buf.)
		if s.suf >= 0 {
			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
			s.suf = 1 // == s.r0 after slide below
		}
		n := s.r0 - 1
		copy(s.buf[:], s.buf[n:s.w])
		s.offs += n
		s.r0 = 1 // eqv: s.r0 -= n
		s.r -= n
		s.w -= n
	}

	// read more data: try a limited number of times
	for i := 100; i > 0; i-- {
		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
		if n < 0 {
			panic("negative read") // incorrect underlying io.Reader implementation
		}
		s.w += n
		if n > 0 || err != nil {
			s.buf[s.w] = utf8.RuneSelf // sentinel
			if err != nil {
				s.ioerr = err
			}
			return
		}
	}

	s.ioerr = io.ErrNoProgress
}

// startLit marks the start of a literal at the beginning of the most
// recently read rune (s.r0) and empties the literal prefix buffer,
// keeping its capacity.
func (s *source) startLit() {
	s.suf = s.r0
	s.lit = s.lit[:0] // reuse lit
}

// stopLit ends the current literal and returns its bytes, from the
// position marked by startLit up to the current read position.
//
// The result is either a slice of the source buffer or, if part of
// the literal was saved during a refill, the literal buffer with the
// remaining bytes appended; in both cases it aliases internal storage.
func (s *source) stopLit() []byte {
	lit := s.buf[s.suf:s.r]
	if len(s.lit) > 0 {
		lit = append(s.lit, lit...)
	}
	s.suf = -1 // no pending literal
	return lit
}