黄色在线不卡,bnb99成人A片

在測試hive的load性能時，我們在建表時指定使用|作為分隔符。這樣就需要考慮一個問題：如果外部文本中|作為文本內容出現時，如何區分到底是分隔符還是文本內容。

首先測試hive是否能智能區分分隔符與文本內容，結果表明：當分隔符與文本內容相同時，會產生數據混淆問題。

SQL如何解決這一問題

參考鏈接：如何在sqlldr中導入多字符分隔符文件

SQL中對此的解決辦法是采用多字符分隔符，以降低產生數據混淆的概率。

Hive中對這一問題的解決思路與SQL相同

然而hive中默認只支持單字符分隔符，如果指定多字符分隔符將會報錯。

參考鏈接：hive如何處理多分隔符數據
hive處理日志，自定義inputformat
hive創建表指定分隔符，不支持多個字符作為分隔符

解決辦法：

利用hive自帶的序列化/反序列化的方式RegexSerDe
重寫相應的InputFormat和OutputFormat方法

1. 利用hive自帶的序列化/反序列化的方式RegexSerDe
這種方式稍微復雜一點，對數據的控制能力也要弱一些，它使用正則表達式來匹配和處理數據，性能也會有所影響。但它的優點是可以自定義表屬性信息 SERDEPROPERTIES ，在 SerDe 中通過這些屬性信息可以有更多的定制行為。

/*樣例數據*/
110|#警察
120|#醫院
/*建表*/ 
-- Register the hive-contrib jar that provides the contrib RegexSerDe used below.
add jar /home/cup/software/……/hive-contrib-0.10.0-cdh4.4.0.jar;

-- Table for lines shaped like <id>|#<name>, partitioned by c_day.
-- input.regex splits each line at the first two-character delimiter |# only.
-- The earlier pattern used [^\\|#]* for both fields, so a value containing a
-- lone | or # made the whole line fail to match and the row came back as NULLs.
-- output.format.string now writes the delimiter between the two columns so that
-- rows written through this SerDe can be parsed again by input.regex.
create table test
(
    id string,
    name string
)
partitioned by (c_day string)
row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
with serdeproperties
(
    'input.regex' = '(.*?)\\|#(.*)',
    'output.format.string' = '%1$s|#%2$s'
)
stored as textfile;
 /*load*/
-- Load the local sample file into partition c_day = 20141027, replacing its contents.
load data local inpath '/……/test.txt'
overwrite into table test
partition (c_day = '20141027');
-- Query the table (expected rows follow below).
select *
from test;
110 警察 20141027
120 醫院 20141027

2. 自定義 outputformat 和 inputformat
Hive 的 outputformat/inputformat 與 hadoop 的 outputformat/inputformat 相當(dāng)類(lèi)似， inputformat 負(fù)責(zé)把輸入數(shù)據(jù)進(jìn)行格式化，然后提供給 Hive，outputformat 負(fù)責(zé)把 Hive 輸出的數(shù)據(jù)重新格式化成目標(biāo)格式再輸出到文件，這種對(duì)格式進(jìn)行定制的方式較為底層，對(duì)其進(jìn)行定制也相對(duì)簡(jiǎn)單，重寫(xiě) InputFormat 中 RecordReader 類(lèi)中的 next 方法即可。

/*樣例數據*/
2010-05-31 10:50:17|||61.132.4.82|||http://www.360buy.com/product/201185.html 
/*分隔符是“ ||| ”，這是為了盡可能防止日志正文出現與分隔符相同的字符而導致數據混淆。 hive 的內部分隔符是“ \001 ”，所以我們需要做一下轉換 */
/*編寫自定義InputFormat */
package com.jd.cloud.clickstore;    
import java.io.IOException;    
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapred.FileSplit;  
import org.apache.hadoop.mapred.InputSplit;  
import org.apache.hadoop.mapred.JobConf;  
import org.apache.hadoop.mapred.JobConfigurable;  
import org.apache.hadoop.mapred.RecordReader;  
import org.apache.hadoop.mapred.Reporter;  
import org.apache.hadoop.mapred.TextInputFormat;    
/**
 * Custom implementation of Hadoop's org.apache.hadoop.mapred.InputFormat.
 *
 * It extends TextInputFormat and hands every split to a
 * ClickstreamRecordReader instead of the stock line reader.
 *
 * @author winston
 */
public class ClickstreamInputFormat extends TextInputFormat implements
        JobConfigurable {

    /**
     * Creates the record reader for one input split.
     *
     * @param genericSplit the split to read; it is cast to FileSplit, so any
     *                     other split type fails with a ClassCastException
     * @param job          the job configuration passed on to the reader
     * @param reporter     receives the split's string form as its status
     * @return a ClickstreamRecordReader over the given split
     * @throws IOException declared by the InputFormat contract
     */
    public RecordReader<LongWritable, Text> getRecordReader(
            InputSplit genericSplit, JobConf job, Reporter reporter)
            throws IOException {
        // Report which split is being processed before opening it.
        final String splitDescription = genericSplit.toString();
        reporter.setStatus(splitDescription);

        final FileSplit fileSplit = (FileSplit) genericSplit;
        return new ClickstreamRecordReader(job, fileSplit);
    }
}
/*自定義ClickstreamRecordReader實(shí)現(xiàn)RecordReader接口，并重寫(xiě)next方法 */
/**
 * Reads the next line of the split into value, lower-cases it and rewrites
 * every three-character delimiter "|||" to the single control character
 * \001 before handing it back.
 *
 * The key is set to the position at which the line starts. Lines whose size
 * reaches maxLineLength are logged and skipped.
 *
 * @param key   receives the position of the line that was read
 * @param value receives the transformed line text
 * @return true when a line was read into value; false when readLine returns 0
 *         or pos has reached end
 * @throws IOException if reading the underlying stream fails
 */
  public synchronized boolean next(LongWritable key, Text value)
    throws IOException {
    while (pos < end) {
      key.set(pos);
      int newSize = in.readLine(value, maxLineLength,
                                Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),
                                         maxLineLength));
      // Nothing was read: stop before doing any work on the value.
      if (newSize == 0) {
        return false;
      }
      // The delimiter is a fixed string, so a literal replace is used instead
      // of a regex, and the result is written straight back into value
      // without an intermediate Text copy.
      // NOTE(review): toLowerCase() lower-cases the whole record (URLs
      // included) using the default locale - confirm this is intended.
      value.set(value.toString().toLowerCase().replace("|||", "\001"));
      pos += newSize;
      if (newSize < maxLineLength) {
        return true;
      }
      // line too long. try again
      LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }
    return false;
  }
/*我們可以直接使用LineRecordReader，修改next方法 */
/*啟動hive，添加我們自己剛剛添加的類 */
/*創建數據庫*/
/*自定義 outputformat/inputformat 后，在建表時需要指定 outputformat/inputformat */
-- Table whose rows are read through the custom ClickstreamInputFormat.
-- NOTE(review): LOCATION is given a path ending in .txt, the same path the LOAD
-- below reads from the local filesystem - confirm it is meant to be the table
-- directory.
CREATE TABLE clickstream_table (
    time STRING,
    ip STRING,
    url STRING
)
STORED AS
    INPUTFORMAT 'com.jd.cloud.clickstore.ClickstreamInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/data/clickstream_20110216.txt';
-- Load the local log file into the table, replacing its current contents.
LOAD DATA LOCAL INPATH '/data/clickstream_20110216.txt'
OVERWRITE INTO TABLE clickstream_table;
-- Query the rows that were just loaded.
SELECT *
FROM clickstream_table;

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

Hive load外部文件時如何區別分隔符與文本內容

Hive load外部文件時如何區別分隔符與文本內容

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

Hive load外部文件時如何區別分隔符與文本內容

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av