librustc_unicode

author: pravic <[email protected]> 2016-04-12 17:45:15 +0300
committer: pravic <[email protected]> 2016-04-12 17:45:15 +0300
commit: 71bb406e75aebb9f7efbaf69dd8f6c73b559932c (patch)
tree: bbc0b9c127c63ced8437c1c2f237bbbd191f97f3 /librustc_unicode/u_str.rs
parent: libcollections (diff)
download: kmd-env-rs-71bb406e75aebb9f7efbaf69dd8f6c73b559932c.tar.xz
kmd-env-rs-71bb406e75aebb9f7efbaf69dd8f6c73b559932c.zip
1 files changed, 189 insertions, 0 deletions
diff --git a/librustc_unicode/u_str.rs b/librustc_unicode/u_str.rs
new file mode 100644
index 0000000..18734a6
--- /dev/null
+++ b/librustc_unicode/u_str.rs
@@ -0,0 +1,189 @@
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Unicode-intensive string manipulations.
+//!
+//! This module provides functionality to `str` that requires the Unicode
+//! methods provided by the unicode parts of the CharExt trait.
+
+use core::char;
+use core::iter::Filter;
+use core::str::Split;
+
+/// An iterator over the non-whitespace substrings of a string, separated by any
+/// amount of whitespace; `next` and `next_back` simply forward to the wrapped `inner`.
+#[stable(feature = "split_whitespace", since = "1.1.0")]
+pub struct SplitWhitespace<'a> {
+    inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>, // split on a char predicate, then filter the pieces
+}
+
+/// Methods for Unicode string slices: whitespace splitting, classification and trimming.
+#[allow(missing_docs)] // docs in libcollections
+pub trait UnicodeStr {
+    fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>; // iterator tied to the borrow of `self`
+    fn is_whitespace(&self) -> bool; // `str` impl: `all` chars satisfy `char::is_whitespace`
+    fn is_alphanumeric(&self) -> bool; // `str` impl: `all` chars satisfy `char::is_alphanumeric`
+    fn trim(&self) -> &str; // `str` impl: `trim_matches` with the whitespace predicate
+    fn trim_left(&self) -> &str; // `str` impl: `trim_left_matches` with the whitespace predicate
+    fn trim_right(&self) -> &str; // `str` impl: `trim_right_matches` with the whitespace predicate
+}
+
+// `UnicodeStr` for `str`: every method is a thin layer over the `str`
+// splitting/trimming primitives combined with a `char` classification method.
+impl UnicodeStr for str {
+    /// Splits on every whitespace `char` and filters out the empty pieces.
+    #[inline]
+    fn split_whitespace(&self) -> SplitWhitespace {
+        // Both helpers are named functions so that they can be coerced to the
+        // plain `fn` pointer types spelled out in `SplitWhitespace::inner`.
+        fn separator(ch: char) -> bool {
+            ch.is_whitespace()
+        }
+        fn has_content(piece: &&str) -> bool {
+            !piece.is_empty()
+        }
+        let split_on: fn(char) -> bool = separator;
+        let keep: fn(&&str) -> bool = has_content;
+
+        let pieces = self.split(split_on).filter(keep);
+        SplitWhitespace { inner: pieces }
+    }
+
+    /// `true` when `all` chars of the slice are whitespace.
+    #[inline]
+    fn is_whitespace(&self) -> bool {
+        let mut chars = self.chars();
+        chars.all(|ch| ch.is_whitespace())
+    }
+
+    /// `true` when `all` chars of the slice are alphanumeric.
+    #[inline]
+    fn is_alphanumeric(&self) -> bool {
+        let mut chars = self.chars();
+        chars.all(|ch| ch.is_alphanumeric())
+    }
+
+    /// Strips whitespace from both ends of the slice.
+    #[inline]
+    fn trim(&self) -> &str {
+        let is_space = |ch: char| ch.is_whitespace();
+        self.trim_matches(is_space)
+    }
+
+    /// Strips whitespace from the start of the slice.
+    #[inline]
+    fn trim_left(&self) -> &str {
+        let is_space = |ch: char| ch.is_whitespace();
+        self.trim_left_matches(is_space)
+    }
+
+    /// Strips whitespace from the end of the slice.
+    #[inline]
+    fn trim_right(&self) -> &str {
+        let is_space = |ch: char| ch.is_whitespace();
+        self.trim_right_matches(is_space)
+    }
+}
+
+// Sequence length keyed by the first byte of a UTF-8 sequence, see
+// https://tools.ietf.org/html/rfc3629
+//
+// A 0 entry marks a byte that does not start a sequence in this table:
+// 0x80..=0xBF, 0xC0, 0xC1 and 0xF5..=0xFF.
+static UTF8_CHAR_WIDTH: [u8; 256] = [
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
+0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
+4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
+];
+
+/// Looks up, from its first byte `b`, how many bytes a UTF-8 encoded
+/// character occupies; the result is 0 for bytes the table marks as
+/// unable to start a sequence.
+#[inline]
+pub fn utf8_char_width(b: u8) -> usize {
+    // The table has one entry per possible `u8`, so the index is always in range.
+    let width = UTF8_CHAR_WIDTH[b as usize];
+    width as usize
+}
+
+/// Checks whether the `u16` slice `v` is well-formed UTF-16.
+///
+/// Every code unit must either be a valid `char` on its own or be the first
+/// half of a surrogate pair that is immediately followed by its second half.
+/// An empty slice is accepted.
+pub fn is_utf16(v: &[u16]) -> bool {
+    let mut units = v.iter();
+    while let Some(&lead) = units.next() {
+        // A unit that converts to a `char` by itself needs no partner.
+        if char::from_u32(lead as u32).is_some() {
+            continue;
+        }
+
+        // Otherwise a second unit is required; running out of input here
+        // means the sequence is truncated.
+        let trail = match units.next() {
+            Some(&unit) => unit,
+            None => return false,
+        };
+
+        // NOTE: the lower bound on `lead` is looser than the start of the
+        // surrogate range (0xD800); that is harmless because only units
+        // rejected by `from_u32` get this far.
+        let lead_ok = lead >= 0xD7FF && lead <= 0xDBFF;
+        let trail_ok = trail >= 0xDC00 && trail <= 0xDFFF;
+        if !(lead_ok && trail_ok) {
+            return false;
+        }
+    }
+    true
+}
+
+/// Adaptor that turns an iterator of `char`s into an iterator of UTF-16
+/// code units.
+#[derive(Clone)]
+pub struct Utf16Encoder<I> {
+    // Source of the `char`s that still have to be encoded.
+    chars: I,
+    // Second code unit of a two-unit encoding, parked until the following
+    // `next` call; 0 means that nothing is parked.
+    extra: u16,
+}
+
+impl<I> Utf16Encoder<I> {
+    /// Wraps the `char` iterator `chars` in an encoder that starts out with
+    /// no code unit pending.
+    pub fn new(chars: I) -> Utf16Encoder<I>
+        where I: Iterator<Item = char>
+    {
+        Utf16Encoder { extra: 0, chars: chars }
+    }
+}
+
+impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
+    type Item = u16;
+
+    /// Yields the next UTF-16 code unit.
+    ///
+    /// A `char` that encodes to two units yields the first one right away;
+    /// the second is parked in `self.extra` and handed out by the following
+    /// call before any further `char` is pulled from the source.
+    #[inline]
+    fn next(&mut self) -> Option<u16> {
+        // Drain the parked unit first; resetting to 0 marks the slot empty.
+        if self.extra != 0 {
+            let tmp = self.extra;
+            self.extra = 0;
+            return Some(tmp);
+        }
+
+        self.chars.next().map(|ch| {
+            let n = CharExt::encode_utf16(ch);
+            let n = n.as_slice();
+            if n.len() == 2 {
+                // Remember the second unit for the next call.
+                self.extra = n[1];
+            }
+            n[0]
+        })
+    }
+
+    /// Bounds on the number of code units still to come.
+    ///
+    /// Every remaining `char` contributes one or two units, and a unit
+    /// parked in `self.extra` contributes exactly one more. The parked unit
+    /// has to be counted: otherwise, once the source is exhausted, the hint
+    /// would claim an upper bound of 0 while `next` still returns `Some`.
+    /// The upper bound becomes `None` if it does not fit in `usize`.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.chars.size_hint();
+        let pending = if self.extra != 0 { 1 } else { 0 };
+
+        let lower = low.saturating_add(pending);
+        let upper = high.and_then(|n| n.checked_mul(2))
+                        .and_then(|n| n.checked_add(pending));
+        (lower, upper)
+    }
+}
+
+// Forward iteration: hand out whatever the wrapped split-and-filter
+// iterator produces next.
+impl<'a> Iterator for SplitWhitespace<'a> {
+    type Item = &'a str;
+
+    /// Returns the next piece from the front, or `None` once `inner` is done.
+    fn next(&mut self) -> Option<&'a str> {
+        Iterator::next(&mut self.inner)
+    }
+}
+// Backward iteration: same delegation, taken from the other end.
+impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
+    /// Returns the next piece from the back, or `None` once `inner` is done.
+    fn next_back(&mut self) -> Option<&'a str> {
+        DoubleEndedIterator::next_back(&mut self.inner)
+    }
+}
author	pravic <[email protected]>	2016-04-12 17:45:15 +0300
committer	pravic <[email protected]>	2016-04-12 17:45:15 +0300
commit	71bb406e75aebb9f7efbaf69dd8f6c73b559932c (patch)
tree	bbc0b9c127c63ced8437c1c2f237bbbd191f97f3 /librustc_unicode/u_str.rs
parent	libcollections (diff)
download	kmd-env-rs-71bb406e75aebb9f7efbaf69dd8f6c73b559932c.tar.xz kmd-env-rs-71bb406e75aebb9f7efbaf69dd8f6c73b559932c.zip