// in `print.c`
#include <stdio.h> // printf
// Prints the first 15 bytes of every command-line argument (argv[0]
// included), one argument per line.
// NOTE(review): the inner loop never looks for the terminating NUL byte, so
// for any argument shorter than 15 bytes it reads past the end of the string.
int main(int argc, char **argv) {
for (int i = 0; i < argc; i++) {
char *arg = argv[i];
// we don't know where to stop, so let's just print 15 characters.
for (int j = 0; j < 15; j++) {
char character = arg[j];
// the %c specifier is for characters
printf("%c", character);
}
// end the line for this argument
printf("\n");
}
return 0;
}
#include <stdio.h> // printf
// Prints every command-line argument (argv[0] included) on its own line,
// walking each string one byte at a time until its NUL terminator.
int main(int argc, char **argv) {
    int index = 0;
    while (index < argc) {
        // a C string ends at the first zero byte, so that is our stop
        // condition (strictly speaking '\0', but plain 0 compares equal).
        for (char *cursor = argv[index]; *cursor != 0; cursor++) {
            printf("%c", *cursor);
        }
        printf("\n");
        index++;
    }
    return 0;
}
$ gcc print.c -o print
$ ./print "ready" "set" "go"
./print
ready
set
go
// in `print.js`
// Write every user-supplied argument on its own line, with a space after
// each character. Iterating a string yields whole code points, not UTF-16
// code units.
const { argv, stdout } = process;
// argv[0] is the path to node and argv[1] the path to this script,
// so the arguments we care about start at index 2.
argv.slice(2).forEach((arg) => {
  Array.from(arg).forEach((character) => {
    stdout.write(character);
    stdout.write(" ");
  });
  stdout.write("\n");
});
$ node print.js "élément"
é l é m e n t
啊! 好多了!Node.js能正确转换为大写吗?
// in `print.js`
// Write the upper-cased form of every user-supplied argument, one per line.
const { argv, stdout } = process;
// skip the node binary and the script path
const userArgs = argv.slice(2);
userArgs.forEach((arg) => {
  const shouted = arg.toUpperCase();
  stdout.write(shouted);
  stdout.write("\n");
});
它不只是ASCII,而是ASCII加上我们自行选择的128个字符。当然,世界上有很多种语言,这额外的128个值无法容纳每种语言的非ASCII字符,因此对于那些大于127的值,出现了几种不同的解释。这些解释被称为“代码页”。上面的图片是Codepage 437,也称为CP437、OEM-US、OEM 437、PC-8或DOS Latin US。
// in `print.c`
#include <stdio.h> // printf
#include <stdint.h> // uint8_t
// Prints the NUL-terminated, UTF-8-encoded string `s` to stdout, writing a
// space after every encoded Unicode scalar value.
//
// The length of each sequence is taken from its leading byte
// (110xxxxx -> 2 bytes, 1110xxxx -> 3, 11110xxx -> 4, anything else -> 1).
// Continuation bytes are not validated, but a sequence that is cut short by
// the end of the string is printed as-is and never read past the terminator.
void print_spaced(char *s) {
    // start at the beginning
    int i = 0;
    // stop as soon as we reach the null terminator
    while (s[i] != '\0') {
        // we're going to be shifting bytes around,
        // so treat them like unsigned 8-bit values
        uint8_t c = (uint8_t) s[i];

        // length of the sequence, ie., number of bytes
        // that encode a single Unicode scalar value
        int len = 1;
        if (c >> 5 == 0b110) {
            len = 2;
        } else if (c >> 4 == 0b1110) {
            len = 3;
        } else if (c >> 3 == 0b11110) {
            len = 4;
        }

        // print the entire UTF-8-encoded Unicode scalar value; the extra
        // terminator check keeps a truncated sequence (e.g. a lone 0xC3 at
        // the end of the input) from walking past the end of the string
        for (; len > 0 && s[i] != '\0'; len--) {
            printf("%c", s[i]);
            i++;
        }

        // print space separator
        printf(" ");
    }
}
// Passes every argument after the program name to `print_spaced` and ends
// the line after each one.
int main(int argc, char **argv) {
    int i = 1; // argv[0] is the program name, skip it
    while (i < argc) {
        print_spaced(argv[i]);
        printf("\n");
        i++;
    }
    return 0;
}
#include <ctype.h> // toupper
// Upper-cases the text given as the first command-line argument.
//
// The argument is handed to `decode_utf8` (defined elsewhere; expected to
// fill `scalars` with a zero-terminated list of scalar values), each scalar
// is printed as `U+XXXX`, mapped through a lower -> upper lookup table,
// and the result is passed to `encode_utf8` and printed.
//
// Returns 0 on success, 1 when no argument was supplied.
int main(int argc, char **argv) {
    if (argc < 2) {
        // argv[1] would be NULL here; bail out instead of decoding it
        fprintf(stderr, "usage: upper <text>\n");
        return 1;
    }

    uint32_t scalars[1024]; // hopefully that's enough
    decode_utf8(argv[1], scalars);
    for (int i = 0; scalars[i] != 0; i++) {
        printf("U+%04X ", scalars[i]);
    }
    printf("\n");

    // 0b11111111111 (U+07FF) is the highest codepoint we can decode/encode
    // successfully; the table needs a slot for that codepoint too, hence +1
    const size_t table_size = 0b11111111111 + 1;
    uint32_t lower_to_upper[table_size];
    // initialize the table to just return the codepoint unchanged
    for (uint32_t cp = 0; cp < table_size; cp++) {
        lower_to_upper[cp] = cp;
    }
    // set a-z => A-Z
    for (int c = 'a'; c <= 'z'; c++) {
        lower_to_upper[(uint32_t) c] = (uint32_t) toupper(c);
    }

    // note: nested functions is a GNU extension!
    // Registers lower[i] => upper[i] for every pair of scalars in the two
    // UTF-8 strings, stopping at the end of the shorter one.
    void set(char *lower, char *upper) {
        uint32_t lower_s[1024];
        uint32_t upper_s[1024];
        decode_utf8(lower, lower_s);
        decode_utf8(upper, upper_s);
        for (int i = 0; lower_s[i] != 0 && upper_s[i] != 0; i++) {
            // ignore anything the table has no slot for
            if (lower_s[i] < table_size) {
                lower_to_upper[lower_s[i]] = upper_s[i];
            }
        }
    }
    // set a few more
    set(
        "éêèàâëüöïÿôîçæœ",
        "ÉÊÈÀÂËÜÖÏŸÔÎÇÆŒ"
    );

    // now convert our scalars to upper-case; scalars outside the table
    // are left unchanged instead of indexing out of bounds
    for (int i = 0; scalars[i] != 0; i++) {
        if (scalars[i] < table_size) {
            scalars[i] = lower_to_upper[scalars[i]];
        }
    }

    uint8_t result[1024]; // yolo
    encode_utf8(scalars, result);
    printf("%s\n", result);
    return 0;
}
$ gcc upper.c -o upper
$ ./upper "Voix ambiguë d'un cœur qui, au zéphyr, préfère les jattes de kiwis"
U+0056 U+006F U+0069 U+0078 U+0020 U+0061 U+006D U+0062 U+0069 U+0067 U+0075 U+00EB U+0020 U+0064 U+0027 U+0075 U+006E U+0020 U+0063 U+0153 U+0075 U+0072 U+0020 U+0071 U+0075 U+0069 U+002C U+0020 U+0061 U+0075 U+0020 U+007A U+00E9 U+0070 U+0068 U+0079 U+0072 U+002C U+0020 U+0070 U+0072 U+00E9 U+0066 U+00E8 U+0072 U+0065 U+0020 U+006C U+0065 U+0073 U+0020 U+006A U+0061 U+0074 U+0074 U+0065 U+0073 U+0020 U+0064 U+0065 U+0020 U+006B U+0069 U+0077 U+0069 U+0073
VOIX AMBIGUË D'UN CŒUR QUI, AU ZÉPHYR, PRÉFÈRE LES JATTES DE KIWIS
传递字符串
首先,是C程序,C很容易!只需使用char *。
// in `woops.c`
#include <stdio.h>
// Returns the number of bytes in `s` that precede its NUL terminator.
int len(char *s) {
    int count = 0;
    for (char *cursor = s; *cursor != '\0'; cursor++) {
        count++;
    }
    return count;
}
// Prints the first command-line argument together with its length in bytes
// as computed by `len`.
//
// Returns 0 on success, 1 when no argument was supplied.
int main(int argc, char **argv) {
    if (argc < 2) {
        // argv[1] is NULL without an argument; `len` would dereference it
        fprintf(stderr, "usage: woops <string>\n");
        return 1;
    }
    char *arg = argv[1];
    int l = len(arg);
    printf("length of \"%s\" = %d\n", arg, l);
    return 0;
}
$ # we're back into the parent of the "rustre" directory
$ # (in case you're following along)
$ gcc woops.c -o woops
$ ./woops "dog"
length of "dog" = 3
$ cargo run
Finished dev [unoptimized + debuginfo] target(s) in 0.01s
Running `target/debug/rustre`
thread 'main' panicked at 'should have one argument', src/libcore/option.rs:1188:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace.
好的!因此,当我们不传递参数时,运行程序会有如上输出。让我们传递一些测试字符串:
$ cargo run --quiet -- "noël"
NOËL
$ cargo run --quiet -- "trans rights"
TRANS RIGHTS
$ cargo run --quiet -- "voix ambiguë d'un cœur qui, au zéphyr, préfère les jattes de kiwis"
VOIX AMBIGUË D'UN CŒUR QUI, AU ZÉPHYR, PRÉFÈRE LES JATTES DE KIWIS
$ cargo run --quiet -- "heinz große"
HEINZ GROSSE
$ cargo run --quiet -- $(printf "\\xC3")
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: "\xC3"', src/libcore/result.rs:1188:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace.
查看错误堆栈信息。
$ RUST_BACKTRACE=1 cargo run --quiet -- $(printf "\\xC3")
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: "\xC3"', src/libcore/result.rs:1188:5
stack backtrace:
(cut)
13: core::result::unwrap_failed
at src/libcore/result.rs:1188
14: core::result::Result<T,E>::unwrap
at /rustc/5e1a799842ba6ed4a57e91f7ab9435947482f7d8/src/libcore/result.rs:956
15: <std::env::Args as core::iter::traits::iterator::Iterator>::next::{{closure}}
at src/libstd/env.rs:789
16: core::option::Option<T>::map
at /rustc/5e1a799842ba6ed4a57e91f7ab9435947482f7d8/src/libcore/option.rs:450
17: <std::env::Args as core::iter::traits::iterator::Iterator>::next
at src/libstd/env.rs:789
18: <&mut I as core::iter::traits::iterator::Iterator>::next
at /rustc/5e1a799842ba6ed4a57e91f7ab9435947482f7d8/src/libcore/iter/traits/iterator.rs:2991
19: core::iter::traits::iterator::Iterator::nth
at /rustc/5e1a799842ba6ed4a57e91f7ab9435947482f7d8/src/libcore/iter/traits/iterator.rs:323
20: <core::iter::adapters::Skip<I> as core::iter::traits::iterator::Iterator>::next
at /rustc/5e1a799842ba6ed4a57e91f7ab9435947482f7d8/src/libcore/iter/adapters/mod.rs:1657
21: rustre::main
at src/main.rs:2
(cut)
/// Prints every Unicode scalar value of the first command-line argument,
/// each followed by a space, then ends the line.
///
/// # Panics
///
/// Panics with "should have one argument" when no argument is supplied.
fn main() {
    let text = std::env::args()
        .nth(1)
        .expect("should have one argument");
    text.chars().for_each(|scalar| print!("{} ", scalar));
    println!();
}
$ cargo run --quiet -- "cup of tea"
c u p o f t e a
很简单!让我们尝试使用非ASCII字符:
$ cargo run --quiet -- "23€ ≈ ¥2731"
2 3 € ≈ ¥ 2 7 3 1
$ cargo run --quiet -- "memory safety 🥺 please 🙏"
m e m o r y s a f e t y 🥺 p l e a s e 🙏
一切似乎都很好。如果我们要打印Unicode标量值的数字而不是它们的字形,该怎么办?
/// Prints each Unicode scalar value of the first command-line argument
/// followed by its code point in `U+XXXX` notation, all on one line.
///
/// # Panics
///
/// Panics with "should have one argument" when no argument is supplied.
fn main() {
    let text = std::env::args()
        .nth(1)
        .expect("should have one argument");
    for scalar in text.chars() {
        let code_point = u32::from(scalar);
        print!("{} (U+{:04X}) ", scalar, code_point);
    }
    println!();
}
$ cargo run --quiet -- "aimée"
a (U+0061) i (U+0069) m (U+006D) é (U+00E9) e (U+0065)
酷!如果我们想显示其为UTF-8编码怎么办?我的意思是打印单个字节?
/// Prints the bytes of the first command-line argument as two-digit
/// upper-case hexadecimal numbers separated by spaces.
///
/// # Panics
///
/// Panics with "should have one argument" when no argument is supplied.
fn main() {
    let text = std::env::args()
        .nth(1)
        .expect("should have one argument");
    for byte in text.as_bytes() {
        print!("{:02X} ", byte);
    }
    println!();
}
fn main() { // prints the upper-cased first argument, then tries to print the original
let arg = std::env::args()
.skip(1) // skip the program name
.next()
.expect("should have one argument");
println!("upp = {}", uppercase(arg)); // `arg` is passed by value, i.e. moved into `uppercase`
println!("arg = {}", arg); // NOTE: rejected by the compiler (E0382): `arg` was moved on the previous line
}
/// Returns an upper-cased copy of `s`.
///
/// Takes `s` by value, so the caller gives up ownership of the original.
fn uppercase(s: String) -> String {
    str::to_uppercase(&s)
}
$ cargo build --quiet
error[E0382]: borrow of moved value: `arg`
--> src/main.rs:8:26
|
2 | let arg = std::env::args()
| --- move occurs because `arg` has type `std::string::String`, which does not implement the `Copy` trait
...
7 | println!("upp = {}", uppercase(arg));
| --- value moved here
8 | println!("arg = {}", arg);
| ^^^ value borrowed here after move
error: aborting due to previous error
For more information about this error, try `rustc --explain E0382`.
error: could not compile `rustre`.
fn uppercase(src: &str, dst: String) -> String { // meant to append the upper-cased `src` to `dst` and return it
for c in src.chars() {
for c in c.to_uppercase() { // a single char may upper-case to several chars
dst.push(c); // NOTE: rejected by the compiler (E0596): `dst` is not declared `mut`
}
}
dst
}
$ cargo build --quiet
error[E0596]: cannot borrow `dst` as mutable, as it is not declared as mutable
--> src/main.rs:15:13
|
12 | fn uppercase(src: &str, dst: String) -> String {
| --- help: consider changing this to be mutable: `mut dst`
...
15 | dst.push(c);
| ^^^ cannot borrow as mutable
/// Appends the upper-cased form of `src` to `dst`.
///
/// `dst` is taken by value and nothing is returned, so the caller never
/// sees the result: the string is dropped when this function returns.
fn uppercase(src: &str, mut dst: String) {
    dst.extend(src.chars().flat_map(char::to_uppercase));
}
$ cargo run --quiet -- "dog"
error[E0382]: borrow of moved value: `upp`
--> src/main.rs:10:26
|
7 | let upp = String::new();
| --- move occurs because `upp` has type `std::string::String`, which does not implement the `Copy` trait
8 | uppercase(&arg, upp);
| --- value moved here
9 |
10 | println!("upp = {}", upp);
| ^^^ value borrowed here after move
我们需要让upp可变地借用。
/// Upper-cases the first command-line argument into a separate buffer and
/// prints both the upper-cased copy and the untouched original.
///
/// # Panics
///
/// Panics with "should have one argument" when no argument is supplied.
fn main() {
    let arg = std::env::args()
        .nth(1)
        .expect("should have one argument");
    let mut upp = String::new();
    // `uppercase` only gets a mutable borrow of `upp`, so we still own it
    // (and `arg`) once the call returns.
    uppercase(arg.as_str(), &mut upp);
    println!("upp = {}", upp);
    println!("arg = {}", arg);
}
/// Appends the upper-cased form of `src` to `dst`.
///
/// `dst` is borrowed mutably (it used to be `mut dst: String`), so the
/// caller keeps ownership and can read the result afterwards.
fn uppercase(src: &str, dst: &mut String) {
    dst.extend(src.chars().flat_map(char::to_uppercase));
}
fn main() { // tries to print every argument one indexed element at a time
for arg in std::env::args().skip(1) { // skip the program name
for i in 0..arg.len() {
println!("arg[{}] = {}", i, arg[i]); // NOTE: rejected by the compiler (E0277): `String` cannot be indexed by `usize`
}
}
}
$ cargo run --quiet -- "dog"
error[E0277]: the type `std::string::String` cannot be indexed by `usize`
--> src/main.rs:4:41
|
4 | println!("arg[{}] = {}", i, arg[i]);
| ^^^^^^ `std::string::String` cannot be indexed by `usize`
|
= help: the trait `std::ops::Index<usize>` is not implemented for `std::string::String`
我们不可以。我们可以先将其转换为Unicode标量值数组,然后对其进行索引:
/// For every command-line argument, collects its Unicode scalar values into
/// a vector and prints each one next to its index.
fn main() {
    for arg in std::env::args().skip(1) {
        // a `Vec<char>` has one element per scalar value
        let scalars = arg.chars().collect::<Vec<char>>();
        for (i, scalar) in scalars.iter().enumerate() {
            println!("arg[{}] = {}", i, scalar);
        }
    }
}
$ cargo run --quiet -- "dog"
arg[0] = d
arg[1] = o
arg[2] = g
/// Prints every command-line argument in debug form, followed by the same
/// argument after it has gone through `strip`.
fn main() {
    std::env::args().skip(1).for_each(|arg| {
        let stripped = strip(&arg);
        println!(" arg = {:?}", arg);
        println!("stripped = {:?}", stripped);
    });
}
/// Returns the part of `src` that remains once all leading and trailing
/// space characters (`' '` only, no other whitespace) are removed.
///
/// The result is a sub-slice of `src`; nothing is allocated or copied.
fn strip(src: &str) -> &str {
    src.trim_start_matches(' ').trim_end_matches(' ')
}
而且效果也一样。不过,这似乎很危险。如果原始字符串的内存被释放怎么办?
fn main() { // tries to keep a slice returned by `strip` after the `String` it borrows from is gone
let stripped;
{
let original = String::from(" floating in space ");
stripped = strip(&original); // `stripped` borrows from `original`
} // `original` is dropped here while still borrowed
println!("stripped = {:?}", stripped); // NOTE: rejected by the compiler (E0597): `original` does not live long enough
}
$ cargo run --quiet -- " floating in space "
error[E0597]: `original` does not live long enough
--> src/main.rs:5:26
|
5 | stripped = strip(&original);
| ^^^^^^^^^ borrowed value does not live long enough
6 | }
| - `original` dropped here while still borrowed
7 | println!("stripped = {:?}", stripped);
| -------- borrow later used here
在Rust中?编译器将检查所有的"恶作剧"。
最后,String可以用范围来索引,这很酷,但是……这些是字符范围吗?
/// Prints the `..4` prefix of every command-line argument in debug form.
///
/// The range counts bytes, not characters: for an argument made of 4-byte
/// emoji the prefix holds a single emoji.
fn main() {
    std::env::args().skip(1).for_each(|arg| {
        let first_four = &arg[..4];
        println!("first four = {:?}", first_four);
    });
}
$ cargo run --quiet -- "want safety?"
first four = "want"
$ cargo run --quiet -- "🙈🙉🙊💥"
first four = "🙈"
/// Prints the `..2` prefix of every command-line argument in debug form.
///
/// # Panics
///
/// The range is a byte range, so this panics when byte index 2 falls inside
/// a multi-byte character (for example the first emoji of "🙈🙉🙊💥").
fn main() {
    let args = std::env::args().skip(1);
    for arg in args {
        let first_two: &str = &arg[..2];
        println!("first two = {:?}", first_two);
    }
}
$ cargo run --quiet -- "🙈🙉🙊💥"
thread 'main' panicked at 'byte index 2 is not a char boundary; it is inside '🙈' (bytes 0..4) of `🙈🙉🙊💥`', src/libcore/str/mod.rs:2069:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace.