hive 函数

date_format ()

用法

select date_format('2022-01-01 12:20:20','yyyy-MM-dd')

返回值

2022-01-01

函数含义

对输入的日期进行格式化

源码地址

org.apache.hadoop.hive.ql.udf.generic.GenericUDFDateFormat

源代码

/**
 * Hive generic UDF {@code date_format(date/timestamp/string, fmt)}.
 *
 * <p>Formats the first argument as a string using the {@link SimpleDateFormat} pattern given as
 * the second argument. The pattern must be a constant; the formatter is built once in
 * {@link #initialize} with its time zone set to UTC and reused by every {@link #evaluate} call.
 *
 * <p>{@code evaluate} returns {@code null} when no formatter could be built (null or invalid
 * pattern) or when the first argument yields neither a timestamp nor a date.
 */
@Description(name = "date_format", value = "_FUNC_(date/timestamp/string, fmt) - converts a date/timestamp/string "
    + "to a value of string in the format specified by the date format fmt.",
    extended = "Supported formats are SimpleDateFormat formats - "
        + "https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html. "
        + "Second argument fmt should be constant.\n"
        + "Example: > SELECT _FUNC_('2015-04-08', 'y');\n '2015'")
public class GenericUDFDateFormat extends GenericUDF {
  // Converters used to read the first argument as a Timestamp
  private transient Converter[] tsConverters = new Converter[2];
  // Primitive categories of the arguments, as seen by the timestamp path
  private transient PrimitiveCategory[] tsInputTypes = new PrimitiveCategory[2];
  // Converters used to read the first argument as a Date
  private transient Converter[] dtConverters = new Converter[2];
  // Primitive categories of the arguments, as seen by the date path
  private transient PrimitiveCategory[] dtInputTypes = new PrimitiveCategory[2];
  // Reusable java.util.Date instance handed to the formatter on every evaluate() call
  private final java.util.Date date = new java.util.Date();
  // Reusable Hadoop Text instance returned as the result
  private final Text output = new Text();
  // Built in initialize() from the constant format argument; stays null if the pattern
  // is null or invalid
  private transient SimpleDateFormat formatter;

  /**
   * Validates the two arguments, prepares the timestamp/date converters for the first argument
   * and builds the formatter from the constant format string.
   *
   * @param arguments object inspectors of the call arguments (value, format)
   * @return the writable string object inspector describing the result
   * @throws UDFArgumentException if the second argument is not a constant, or if one of the
   *         argument checks performed by the helper methods fails
   */
  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {

    // Exactly two arguments are required (min = max = 2)
    checkArgsSize(arguments, 2, 2);

    // Both arguments must be Hive primitive types
    checkArgPrimitive(arguments, 0);
    checkArgPrimitive(arguments, 1);



    // the function should support both short date and full timestamp format
    // time part of the timestamp should not be skipped
    // The first argument must belong to the string or date group; the check is run once per
    // input-type array (tsInputTypes and dtInputTypes).
    // NOTE(review): checkArgGroups presumably records the detected category in the array it is
    // given - confirm against GenericUDF.
    checkArgGroups(arguments, 0, tsInputTypes, STRING_GROUP, DATE_GROUP);
    checkArgGroups(arguments, 0, dtInputTypes, STRING_GROUP, DATE_GROUP);

    // The format argument must belong to the string group
    checkArgGroups(arguments, 1, tsInputTypes, STRING_GROUP);

    // Obtain both a timestamp converter and a date converter for the first argument
    obtainTimestampConverter(arguments, 0, tsInputTypes, tsConverters);
    obtainDateConverter(arguments, 0, dtInputTypes, dtConverters);

    if (arguments[1] instanceof ConstantObjectInspector) {
      // Read the constant format pattern passed to date_format()
      String fmtStr = getConstantStringValue(arguments, 1);
      if (fmtStr != null) {
        try {
          // Build a SimpleDateFormat from the pattern and set its time zone to UTC
          formatter = new SimpleDateFormat(fmtStr);
          formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
        } catch (IllegalArgumentException e) {
          // ignore
          // An invalid pattern leaves formatter null, so evaluate() returns null
        }
      }
    } else {
      // The format argument is not a constant: reject the call
      throw new UDFArgumentTypeException(1, getFuncName() + " only takes constant as "
          + getArgOrder(1) + " argument");
    }

    ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    return outputOI;
  }

  /**
   * Formats the first argument with the formatter prepared in {@link #initialize}.
   *
   * @param arguments deferred call arguments; only index 0 is read here
   * @return the shared {@code Text} holding the formatted value, or {@code null} when there is
   *         no formatter, the input yields neither a timestamp nor a date, or formatting
   *         returns null
   * @throws HiveException declared by the overridden method; may come from the value helpers
   */
  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    // No usable format pattern was supplied at initialize() time
    if (formatter == null) {
      return null;
    }
    // the function should support both short date and full timestamp format
    // time part of the timestamp should not be skipped
    // First try to read the first argument as a Timestamp via the timestamp converters

    Timestamp ts = getTimestampValue(arguments, 0, tsConverters);
    if (ts == null) { // no timestamp could be obtained; fall back to reading a Date
      // Read the first argument as a Date via the date converters
      Date d = getDateValue(arguments, 0, dtInputTypes, dtConverters);
      if (d == null) {
        return null;
      }
      // Convert the date to a timestamp through its epoch milliseconds
      ts = Timestamp.ofEpochMilli(d.toEpochMilli());
    }
    // Load the epoch milliseconds into the reusable java.util.Date
    date.setTime(ts.toEpochMilli());
    // Apply the format pattern
    String res = formatter.format(date);
    if (res == null) {
      return null;
    }
    // Store the result in the reusable output object
    output.set(res);
    return output;
  }

  /** Builds the display string from the function name and the child expressions. */
  @Override
  public String getDisplayString(String[] children) {
    return getStandardDisplayString(getFuncName(), children);
  }

  /** Returns the function name used in error messages and the display string. */
  @Override
  protected String getFuncName() {
    return "date_format";
  }
}

源码分析

  1. initialize(ObjectInspector[] arguments) 方法当中基本上就是一些输入参数的校验和初始化的代码
   // 检查输入的参数最大与最小个数
   checkArgsSize(arguments, 2, 2);
	
  /**
   * Verifies that the number of supplied arguments lies within {@code [min, max]}.
   *
   * @param arguments object inspectors of the call arguments
   * @param min smallest accepted argument count
   * @param max largest accepted argument count
   * @throws UDFArgumentLengthException if the argument count is outside the accepted range;
   *         the message names the function, the expected count (a single number when
   *         {@code min == max}, otherwise {@code min..max}) and the actual count
   */
  protected void checkArgsSize(ObjectInspector[] arguments, int min, int max)
      throws UDFArgumentLengthException {
    final int actual = arguments.length;
    if (actual >= min && actual <= max) {
      // Argument count is acceptable, nothing to report
      return;
    }
    final String expected = (min == max) ? String.valueOf(min) : min + ".." + max;
    throw new UDFArgumentLengthException(
        getFuncName() + " requires " + expected + " argument(s), got " + actual);
  }

    // 校验输入的参数类型是不是hive的原始数据类型
    checkArgPrimitive(arguments, 0);
    checkArgPrimitive(arguments, 1);
  /**
   * Verifies that the argument at index {@code i} has the {@code PRIMITIVE} category
   * (Hive primitive types such as string).
   *
   * @param arguments object inspectors of the call arguments
   * @param i index of the argument to check
   * @throws UDFArgumentTypeException if the argument's category is not {@code PRIMITIVE}
   */
  protected void checkArgPrimitive(ObjectInspector[] arguments, int i)
      throws UDFArgumentTypeException {
    final ObjectInspector.Category category = arguments[i].getCategory();
    if (category == ObjectInspector.Category.PRIMITIVE) {
      return;
    }
    final String message = getFuncName() + " only takes primitive types as "
        + getArgOrder(i) + " argument, got " + category;
    throw new UDFArgumentTypeException(i, message);
  }


    // 对于输入的Timestamp 与 Date 获取类型转换器
    obtainTimestampConverter(arguments, 0, tsInputTypes, tsConverters);
  /**
   * Prepares a converter that turns the argument at index {@code i} into a writable timestamp.
   *
   * <p>On success the converter is stored in {@code converters[i]} and the argument's primitive
   * category in {@code inputTypes[i]}.
   *
   * @param arguments object inspectors of the call arguments
   * @param i index of the argument to prepare a converter for
   * @param inputTypes array receiving the argument's primitive category at index {@code i}
   * @param converters array receiving the created converter at index {@code i}
   * @throws UDFArgumentTypeException if the argument's category is not one of STRING, VARCHAR,
   *         CHAR, TIMESTAMP, DATE or TIMESTAMPLOCALTZ
   */
  protected void obtainTimestampConverter(ObjectInspector[] arguments, int i,
      PrimitiveCategory[] inputTypes, Converter[] converters) throws UDFArgumentTypeException {
    final PrimitiveObjectInspector sourceOi = (PrimitiveObjectInspector) arguments[i];
    final PrimitiveCategory category = sourceOi.getPrimitiveCategory();

    // Only string-like and date/time-like categories are accepted
    final boolean supported;
    switch (category) {
    case STRING:
    case VARCHAR:
    case CHAR:
    case TIMESTAMP:
    case DATE:
    case TIMESTAMPLOCALTZ:
      supported = true;
      break;
    default:
      supported = false;
      break;
    }
    if (!supported) {
      throw new UDFArgumentTypeException(i, getFuncName()
          + " only takes STRING_GROUP or DATE_GROUP types as " + getArgOrder(i) + " argument, got "
          + category);
    }

    // Store the source-to-timestamp converter and the detected category for this argument
    final ObjectInspector targetOi = PrimitiveObjectInspectorFactory.writableTimestampObjectInspector;
    converters[i] = ObjectInspectorConverters.getConverter(sourceOi, targetOi);
    inputTypes[i] = category;
  }
    obtainDateConverter(arguments, 0, dtInputTypes, dtConverters);
    
  2. evaluate(DeferredObject[] arguments) 核心的函数
	// 其中的核心就是使用java的 SimpleDateFormat 进行转换
    Timestamp ts = getTimestampValue(arguments, 0, tsConverters);
    if (ts == null) { // 如果输入的类型不是Timestamp 就是 Date了
      // 根据类型转换器获取 date
      Date d = getDateValue(arguments, 0, dtInputTypes, dtConverters);
      if (d == null) {
        return null;
      }
      // 把date 转换成 ts
      ts = Timestamp.ofEpochMilli(d.toEpochMilli());
    }
    // 赋值给 date
    date.setTime(ts.toEpochMilli());
    // 进行格式化
    String res = formatter.format(date);

总结

  1. 其实核心代码比较简单,大量的代码都是在做数据类型的校验与转换。
  2. 但是从读代码的角度上看,可以发现源码处处都在做校验,这也说明源码在代码质量上下了很多功夫。