hive 函数
date_format ()
用法
select date_format('2022-01-01 12:20:20','yyyy-MM-dd')
返回值
2022-01-01
函数含义
对输入的日期进行格式化
源码地址
org.apache.hadoop.hive.ql.udf.generic.GenericUDFDateFormat
源代码
/**
 * Generic UDF behind {@code date_format(date/timestamp/string, fmt)}.
 *
 * <p>{@link #initialize} validates the two arguments and builds a
 * {@code SimpleDateFormat} (time zone set to UTC) from the constant format
 * string. {@link #evaluate} reads the first argument as a timestamp, falling
 * back to a date, and returns the formatted string.
 *
 * <p>A single {@code java.util.Date} and a single {@code Text} are reused
 * across {@code evaluate} calls, so the returned {@code Text} is overwritten
 * by the next call.
 */
@Description(name = "date_format", value = "_FUNC_(date/timestamp/string, fmt) - converts a date/timestamp/string "
+ "to a value of string in the format specified by the date format fmt.",
extended = "Supported formats are SimpleDateFormat formats - "
+ "https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html. "
+ "Second argument fmt should be constant.\n"
+ "Example: > SELECT _FUNC_('2015-04-08', 'y');\n '2015'")
public class GenericUDFDateFormat extends GenericUDF {
// Converters passed to obtainTimestampConverter and later to getTimestampValue
private transient Converter[] tsConverters = new Converter[2];
// Primitive categories handed to checkArgGroups / obtainTimestampConverter
private transient PrimitiveCategory[] tsInputTypes = new PrimitiveCategory[2];
// Converters passed to obtainDateConverter and later to getDateValue
private transient Converter[] dtConverters = new Converter[2];
// Primitive categories handed to checkArgGroups / obtainDateConverter
private transient PrimitiveCategory[] dtInputTypes = new PrimitiveCategory[2];
// Reusable Date instance; its time is overwritten on every evaluate() call
private final java.util.Date date = new java.util.Date();
// Reusable Hadoop Text used as the output value
private final Text output = new Text();
// Built in initialize(); stays null when the format string is null or invalid
private transient SimpleDateFormat formatter;
/**
 * Validates the arguments and prepares the formatter.
 *
 * @param arguments object inspectors for the two arguments (value, format)
 * @return the writable string object inspector describing the output
 * @throws UDFArgumentException if the second argument is not a constant
 *         (thrown here as UDFArgumentTypeException); the check* helpers
 *         called first may also throw on a bad argument count or type
 */
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
// Require exactly two arguments (min = max = 2)
checkArgsSize(arguments, 2, 2);
// Both arguments must be Hive primitive types
checkArgPrimitive(arguments, 0);
checkArgPrimitive(arguments, 1);
// the function should support both short date and full timestamp format
// time part of the timestamp should not be skipped
// Argument 0 is checked against STRING_GROUP / DATE_GROUP twice, once with
// tsInputTypes and once with dtInputTypes; argument 1 (the format) is checked
// against STRING_GROUP.
// NOTE(review): checkArgGroups presumably records the matched category in the
// array it is given - confirm against GenericUDF.
checkArgGroups(arguments, 0, tsInputTypes, STRING_GROUP, DATE_GROUP);
checkArgGroups(arguments, 0, dtInputTypes, STRING_GROUP, DATE_GROUP);
checkArgGroups(arguments, 1, tsInputTypes, STRING_GROUP);
// Obtain both a timestamp converter and a date converter for argument 0
obtainTimestampConverter(arguments, 0, tsInputTypes, tsConverters);
obtainDateConverter(arguments, 0, dtInputTypes, dtConverters);
if (arguments[1] instanceof ConstantObjectInspector) {
// Read the constant format pattern passed to date_format()
String fmtStr = getConstantStringValue(arguments, 1);
if (fmtStr != null) {
try {
// Build a java SimpleDateFormat from the pattern and set its time zone to UTC
formatter = new SimpleDateFormat(fmtStr);
formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
} catch (IllegalArgumentException e) {
// ignore: formatter stays null, so evaluate() returns null
}
}
} else {
// The format argument is not a constant: reject it
throw new UDFArgumentTypeException(1, getFuncName() + " only takes constant as "
+ getArgOrder(1) + " argument");
}
ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
return outputOI;
}
/**
 * Formats the first argument with the formatter built in initialize().
 *
 * @param arguments deferred argument values; only index 0 is read here
 * @return the shared Text holding the formatted string, or null when the
 *         formatter is null, when neither a timestamp nor a date can be
 *         obtained from the argument, or when formatting yields null
 * @throws HiveException declared by the overridden method
 */
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
if (formatter == null) {
return null;
}
// the function should support both short date and full timestamp format
// time part of the timestamp should not be skipped
// Try to read argument 0 as a Timestamp through the timestamp converters
Timestamp ts = getTimestampValue(arguments, 0, tsConverters);
if (ts == null) { // no timestamp obtained; fall back to reading the argument as a date
// Read argument 0 as a Date through the date converters
Date d = getDateValue(arguments, 0, dtInputTypes, dtConverters);
if (d == null) {
return null;
}
// Convert the date to a timestamp via its epoch milliseconds
ts = Timestamp.ofEpochMilli(d.toEpochMilli());
}
// Load the epoch milliseconds into the reusable java.util.Date
date.setTime(ts.toEpochMilli());
// Format with SimpleDateFormat
String res = formatter.format(date);
if (res == null) {
return null;
}
// Store the result in the reusable output Text
output.set(res);
return output;
}
/** Returns the display string built by getStandardDisplayString from the function name and children. */
@Override
public String getDisplayString(String[] children) {
return getStandardDisplayString(getFuncName(), children);
}
/** Returns the function name, "date_format". */
@Override
protected String getFuncName() {
return "date_format";
}
}
源码分析
- initialize(ObjectInspector[] arguments) 方法当中基本上就是一些输入参数的校验和初始化的代码
// 检查输入的参数最大与最小个数
checkArgsSize(arguments, 2, 2);
/**
 * Validates the number of arguments.
 *
 * <p>On failure the message reads
 * "{funcName} requires {min} argument(s), got {n}" when min equals max,
 * otherwise "{funcName} requires {min}..{max} argument(s), got {n}".
 *
 * @param arguments the argument object inspectors
 * @param min minimum allowed number of arguments (inclusive)
 * @param max maximum allowed number of arguments (inclusive)
 * @throws UDFArgumentLengthException if arguments.length is below min or above max
 */
protected void checkArgsSize(ObjectInspector[] arguments, int min, int max)
throws UDFArgumentLengthException {
if (arguments.length < min || arguments.length > max) {
StringBuilder sb = new StringBuilder();
sb.append(getFuncName());
sb.append(" requires ");
if (min == max) {
sb.append(min);
} else {
sb.append(min).append("..").append(max);
}
sb.append(" argument(s), got ");
sb.append(arguments.length);
// Wrong argument count: report it with a length exception
throw new UDFArgumentLengthException(sb.toString());
}
}
// 校验输入的参数类型是不是hive的原始数据类型
checkArgPrimitive(arguments, 0);
checkArgPrimitive(arguments, 1);
/**
 * Checks that the argument at index i has category PRIMITIVE.
 *
 * @param arguments the argument object inspectors
 * @param i index of the argument to check
 * @throws UDFArgumentTypeException if the argument's category is not
 *         ObjectInspector.Category.PRIMITIVE; the message includes the
 *         function name, the argument order and the actual category
 */
protected void checkArgPrimitive(ObjectInspector[] arguments, int i)
throws UDFArgumentTypeException {
ObjectInspector.Category oiCat = arguments[i].getCategory();
if (oiCat != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentTypeException(i, getFuncName() + " only takes primitive types as "
+ getArgOrder(i) + " argument, got " + oiCat);
}
}
// 对于输入的Timestamp 与 Date 获取类型转换器
obtainTimestampConverter(arguments, 0, tsInputTypes, tsConverters);
/**
 * Obtains a converter from the argument's inspector to the writable timestamp
 * inspector and stores it, together with the argument's primitive category,
 * at index i of the given arrays.
 *
 * @param arguments the argument object inspectors
 * @param i index of the argument to process
 * @param inputTypes array that receives the argument's primitive category at index i
 * @param converters array that receives the obtained converter at index i
 * @throws UDFArgumentTypeException if the primitive category is not one of
 *         STRING, VARCHAR, CHAR, TIMESTAMP, DATE or TIMESTAMPLOCALTZ
 */
protected void obtainTimestampConverter(ObjectInspector[] arguments, int i,
PrimitiveCategory[] inputTypes, Converter[] converters) throws UDFArgumentTypeException {
PrimitiveObjectInspector inOi = (PrimitiveObjectInspector) arguments[i];
PrimitiveCategory inputType = inOi.getPrimitiveCategory();
ObjectInspector outOi;
// Accept only the string-like and date/time categories; reject everything else
switch (inputType) {
case STRING:
case VARCHAR:
case CHAR:
case TIMESTAMP:
case DATE:
case TIMESTAMPLOCALTZ:
break;
default:
throw new UDFArgumentTypeException(i, getFuncName()
+ " only takes STRING_GROUP or DATE_GROUP types as " + getArgOrder(i) + " argument, got "
+ inputType);
}
outOi = PrimitiveObjectInspectorFactory.writableTimestampObjectInspector;
// Store the input-to-timestamp converter and the input category at index i
converters[i] = ObjectInspectorConverters.getConverter(inOi, outOi);
inputTypes[i] = inputType;
}
obtainDateConverter(arguments, 0, dtInputTypes, dtConverters);
- evaluate(DeferredObject[] arguments) 核心的函数
// 其中的核心就是使用java的 SimpleDateFormat 进行转换
Timestamp ts = getTimestampValue(arguments, 0, tsConverters);
if (ts == null) { // 如果输入的类型不是Timestamp 就是 Date了
// 根据类型转换器 获取 date
Date d = getDateValue(arguments, 0, dtInputTypes, dtConverters);
if (d == null) {
return null;
}
// 把date 转换成 ts
ts = Timestamp.ofEpochMilli(d.toEpochMilli());
}
// 赋值给 date
date.setTime(ts.toEpochMilli());
// 进行格式化
String res = formatter.format(date);
总结
- 其实核心代码比较简单,大量的代码都是在做数据类型的校验与转换。
- 但是从读代码的角度上看,可以发现源码处处都在做校验,这也说明源码在代码质量上下了很多功夫。