diff options
| author | pravic <[email protected]> | 2016-04-12 17:45:15 +0300 |
|---|---|---|
| committer | pravic <[email protected]> | 2016-04-12 17:45:15 +0300 |
| commit | 71bb406e75aebb9f7efbaf69dd8f6c73b559932c (patch) | |
| tree | bbc0b9c127c63ced8437c1c2f237bbbd191f97f3 /librustc_unicode/u_str.rs | |
| parent | libcollections (diff) | |
| download | kmd-env-rs-71bb406e75aebb9f7efbaf69dd8f6c73b559932c.tar.xz kmd-env-rs-71bb406e75aebb9f7efbaf69dd8f6c73b559932c.zip | |
librustc_unicode
Diffstat (limited to 'librustc_unicode/u_str.rs')
| -rw-r--r-- | librustc_unicode/u_str.rs | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/librustc_unicode/u_str.rs b/librustc_unicode/u_str.rs new file mode 100644 index 0000000..18734a6 --- /dev/null +++ b/librustc_unicode/u_str.rs @@ -0,0 +1,189 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Unicode-intensive string manipulations. +//! +//! This module provides functionality to `str` that requires the Unicode +//! methods provided by the unicode parts of the CharExt trait. + +use core::char; +use core::iter::Filter; +use core::str::Split; + +/// An iterator over the non-whitespace substrings of a string, +/// separated by any amount of whitespace. +#[stable(feature = "split_whitespace", since = "1.1.0")] +pub struct SplitWhitespace<'a> { + inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>, +} + +/// Methods for Unicode string slices +#[allow(missing_docs)] // docs in libcollections +pub trait UnicodeStr { + fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>; + fn is_whitespace(&self) -> bool; + fn is_alphanumeric(&self) -> bool; + fn trim(&self) -> &str; + fn trim_left(&self) -> &str; + fn trim_right(&self) -> &str; +} + +impl UnicodeStr for str { + #[inline] + fn split_whitespace(&self) -> SplitWhitespace { + fn is_not_empty(s: &&str) -> bool { + !s.is_empty() + } + let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer + + fn is_whitespace(c: char) -> bool { + c.is_whitespace() + } + let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer + + SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) } + } + + #[inline] + fn is_whitespace(&self) -> bool { + self.chars().all(|c| c.is_whitespace()) + } + + #[inline] + fn is_alphanumeric(&self) -> bool { + self.chars().all(|c| c.is_alphanumeric()) + } + + #[inline] + fn trim(&self) -> &str { + self.trim_matches(|c: char| c.is_whitespace()) + } + + #[inline] + fn trim_left(&self) -> &str { + self.trim_left_matches(|c: char| c.is_whitespace()) + } + + #[inline] + fn trim_right(&self) -> &str { + self.trim_right_matches(|c: char| c.is_whitespace()) + } +} + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8; 256] = [ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF +0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF +4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF +]; + +/// Given a first byte, determine how many bytes are in this UTF-8 character +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + return UTF8_CHAR_WIDTH[b as usize] as usize; +} + +/// Determines if a vector of `u16` contains valid UTF-16 +pub fn is_utf16(v: &[u16]) -> bool { + let mut it = v.iter(); + macro_rules! next { ($ret:expr) => { + match it.next() { Some(u) => *u, None => return $ret } + } + } + loop { + let u = next!(true); + + match char::from_u32(u as u32) { + Some(_) => {} + None => { + let u2 = next!(false); + if u < 0xD7FF || u > 0xDBFF || u2 < 0xDC00 || u2 > 0xDFFF { + return false; + } + } + } + } +} + +/// Iterator adaptor for encoding `char`s to UTF-16. +#[derive(Clone)] +pub struct Utf16Encoder<I> { + chars: I, + extra: u16, +} + +impl<I> Utf16Encoder<I> { + /// Create a UTF-16 encoder from any `char` iterator. + pub fn new(chars: I) -> Utf16Encoder<I> + where I: Iterator<Item = char> + { + Utf16Encoder { + chars: chars, + extra: 0, + } + } +} + +impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> { + type Item = u16; + + #[inline] + fn next(&mut self) -> Option<u16> { + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + self.chars.next().map(|ch| { + let n = CharExt::encode_utf16(ch); + let n = n.as_slice(); + if n.len() == 2 { + self.extra = n[1]; + } + n[0] + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + let (low, high) = self.chars.size_hint(); + // every char gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low, high.and_then(|n| n.checked_mul(2))) + } +} + +impl<'a> Iterator for SplitWhitespace<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } +} +impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} |