选取文件
更新时间:2023-05-18
SelectObject接口支持用户对BOS中指定格式(CSV/JSON/Parquet)的object内容执行SQL语句,通过SQL这种结构化查询语言对object内容进行筛选、分析、过滤之后再返回用户需要的文件内容。详情请参考“选取Object”。
使用限制见“开发者指南”-“使用及管理数据”-“选取object”部分。
查询csv文件
示例:
Python
1 from baidubce import compat
2 import base64
3
4 # 方便理解,我们先上传一个简单的csv文件
5 csv_content = """
6 1,Maurits,2017-09-12 16:32:57,685856330,-540265154.48,true
7 2,Iago,2018-02-01 12:25:01,-642946677,3781354659.89,false
8 3,Dionisio,2018-02-16 09:52:24,-3823711977,79336720.77,false
9 4,Aleen,2018-05-17 11:48:45,-3289131518,1499686289.41,false
10 5,Herschel,2019-06-04 02:28:37,3456163349,-3810272511.88,true
11 """
12 bos_client.put_object_from_string(bucket_name, key, csv_content)
13 # 设置select_object()接口的参数
14 select_object_args = {
15 "expressionType": "SQL",
16 "inputSerialization": {
17 "compressionType": "NONE",
18 "csv": {
19 "fileHeaderInfo": "NONE",
20 "recordDelimiter": "Cg==",
21 "fieldDelimiter": "LA==",
22 "quoteCharacter": "Ig==",
23 "commentCharacter": "Iw=="
24 }
25 },
26 "outputSerialization": {
27 "outputHeader": False,
28 "csv": {
29 "quoteFields": "ALWAYS",
30 "recordDelimiter": "Cg==",
31 "fieldDelimiter": "LA==",
32 "quoteCharacter": "Ig=="
33 }
34 },
35 "requestProgress": {
36 "enabled": True
37 }
38 }
39 # 设置查询的SQL语句,该语句需要经过base64编码
40 sql_exp = "SELECT _1, _2, _6 FROM BosObject"
41 select_object_args["expression"] = compat.convert_to_string(base64.standard_b64encode(compat.convert_to_bytes(sql_exp)))
42 # 调用select object接口
43 select_response = bos_client.select_object(bucket_name, key, select_object_args)
44 # 获取返回结果的生成器
45 result = select_response.result()
46 for msg in result:
47 print(msg)
48 if msg.headers["message-type"] == "Records":
49 print("type: {}, headers: {}, payload: {}, crc: {}".format(msg.type, msg.headers, msg.payload, msg.crc))
50 elif msg.headers["message-type"] == "Cont":
51 print("type: {}, headers: {}, bytes_scanned: {}, bytes_returned: {}, crc: {}".format(msg.type, msg.headers,
52 msg.bytes_scanned, msg.bytes_returned, msg.crc))
53 else:
54 print("type: {}, headers: {}, crc: {}".format(msg.type, msg.headers, msg.crc))
查询json文件
示例:
Python
1 from baidubce import compat
2 import base64
3
4 # 方便理解,我们先上传一个简单的json文件
5 json_content = """
6 {
7 "name": "Smith",
8 "age": 16,
9 "weight": 65.5,
10 "org": null,
11 "projects":
12 [
13 {"project_name":"project1", "completed":false},
14 {"project_name":"project2", "completed":true}
15 ]
16 }
17 """
18 bos_client.put_object_from_string(bucket_name, key, json_content)
19 # 设置select_object()接口的参数
20 select_object_args = {
21 "expressionType": "SQL",
22 "inputSerialization": {
23 "compressionType": "NONE",
24 "json": {
25 "type": "DOCUMENT"
26 }
27 },
28 "outputSerialization": {
29 "json": {
30 "recordDelimiter": "Cg=="
31 }
32 },
33 "requestProgress": {
34 "enabled": True
35 }
36 }
37 # 设置查询的SQL语句,该语句需要经过base64编码
38 sql_exp = "select projects from BosObject where name='Smith'"
39 select_object_args["expression"] = compat.convert_to_string(base64.standard_b64encode(compat.convert_to_bytes(sql_exp)))
40 # 调用select object接口
41 select_response = bos_client.select_object(bucket_name, key, select_object_args)
42 # 获取返回结果的生成器
43 result = select_response.result()
44 for msg in result:
45 print(msg)
46 if msg.headers["message-type"] == "Records":
47 print("type: {}, headers: {}, payload: {}, crc: {}".format(msg.type, msg.headers, msg.payload, msg.crc))
48 elif msg.headers["message-type"] == "Cont":
49 print("type: {}, headers: {}, bytes_scanned: {}, bytes_returned: {}, crc: {}".format(msg.type, msg.headers,
50 msg.bytes_scanned, msg.bytes_returned, msg.crc))
51 else:
52 print("type: {}, headers: {}, crc: {}".format(msg.type, msg.headers, msg.crc))
查询Parquet文件
Python
1 from baidubce import compat
2 import base64
3
4 # 我们先上传一个简单的Parquet文件
5 '''
6 parquet文件解析内容
7 {"Name":"StudentName","Age":20,"Id":0,"Weight":50,"Sex":true,"Day":19240,"Scores":{"computer":80,"math":90,"physics":90}}
8 {"Name":"StudentName","Age":21,"Id":1,"Weight":50.1,"Sex":false,"Day":19240,"Scores":{"computer":81,"math":91,"physics":91}}
9 {"Name":"StudentName","Age":22,"Id":2,"Weight":50.2,"Sex":true,"Day":19240,"Scores":{"computer":82,"math":92,"physics":92}}
10 {"Name":"StudentName","Age":23,"Id":3,"Weight":50.3,"Sex":false,"Day":19240,"Scores":{"computer":83,"math":93,"physics":90}}
11 {"Name":"StudentName","Age":24,"Id":4,"Weight":50.4,"Sex":true,"Day":19240,"Scores":{"computer":84,"math":94,"physics":91}}
12 {"Name":"StudentName","Age":20,"Id":5,"Weight":50.5,"Sex":false,"Day":19240,"Scores":{"computer":85,"math":90,"physics":92}}
13 {"Name":"StudentName","Age":21,"Id":6,"Weight":50.6,"Sex":true,"Day":19240,"Scores":{"computer":86,"math":91,"physics":90}}
14 {"Name":"StudentName","Age":22,"Id":7,"Weight":50.7,"Sex":false,"Day":19240,"Scores":{"computer":87,"math":92,"physics":91}}
15 {"Name":"StudentName","Age":23,"Id":8,"Weight":50.8,"Sex":true,"Day":19240,"Scores":{"computer":88,"math":93,"physics":92}}
16 {"Name":"StudentName","Age":24,"Id":9,"Weight":50.9,"Sex":false,"Day":19240,"Scores":{"computer":89,"math":94,"physics":90}}
17 '''
18 bos_client.put_object_from_file(bucket_name, key, parquet_file_name)
19 # 设置select_object()接口的参数
20 select_object_args = {
21 "expressionType": "SQL",
22 "inputSerialization": {
23 "compressionType": "NONE",
24 "parquet": {}
25 },
26 "outputSerialization": {
27 "json": {
28 "recordDelimiter": "Cg=="
29 }
30 },
31 "requestProgress": {
32 "enabled": False
33 }
34 }
35 # 设置查询的SQL语句,该语句需要经过base64编码
36 sql_exp = "select * from BosObject s where s.Scores.computer > 85"
37 select_object_args["expression"] = compat.convert_to_string(base64.standard_b64encode(compat.convert_to_bytes(sql_exp)))
38 # 调用select object接口
39 select_response = bos_client.select_object(bucket_name, key, select_object_args)
40 # 获取返回结果的生成器
41 result = select_response.result()
42 for msg in result:
43 print(msg)
44 if msg.headers["message-type"] == "Records":
45 print("type: {}, headers: {}, payload: {}, crc: {}".format(msg.type, msg.headers, msg.payload, msg.crc))
46 elif msg.headers["message-type"] == "Cont":
47 print("type: {}, headers: {}, bytes_scanned: {}, bytes_returned: {}, crc: {}".format(msg.type, msg.headers,
48 msg.bytes_scanned, msg.bytes_returned, msg.crc))
49 else:
50 print("type: {}, headers: {}, crc: {}".format(msg.type, msg.headers, msg.crc))