在kettle里调用java代码出错
/**
 * Processes a single row: fetches the next input row, widens it to the output layout and hands
 * it to the next step.
 *
 * @param smi step metadata passed in by the caller (unused in this template)
 * @param sdi step data passed in by the caller (unused in this template)
 * @return {@code false} after {@code setOutputDone()} when {@code getRow()} yields no row,
 *         {@code true} when a row was forwarded
 * @throws KettleException as declared; presumably raised by the row helpers - confirm
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    // One-time hook: runs only on the first invocation.
    if (first) {
        first = false;
        /* TODO: Your code here. (Using info fields)
        FieldHelper infoField = get(Fields.Info, "info_field_name");
        RowSet infoStream = findInfoRowSet("info_stream_tag");
        Object[] infoRow = null;
        int infoRowCount = 0;
        // Drain the info step completely before the first getRow() call: getRow() returns the
        // first row from any input rowset, and the info/input row layouts differ.
        while ((infoRow = getRowFrom(infoStream)) != null) {
            // do something with info data
            infoRowCount++;
        }
        */
    }

    final Object[] inputRow = getRow();
    if (inputRow == null) {
        // No more input: signal completion and stop.
        setOutputDone();
        return false;
    }

    // Always go through createOutputRow() so the Object[] is large enough for any new fields
    // this step adds.
    final Object[] outputRow = createOutputRow(inputRow, data.outputRowMeta.size());

    /* TODO: Your code here. (See Sample)
    // Get the value from an input field
    String foobar = get(Fields.In, "a_fieldname").getString(outputRow);
    foobar += "bar";
    // Set a value in a new output field
    get(Fields.Out, "output_fieldname").setValue(outputRow, foobar);
    */

    // Pass the row on to the next step.
    putRow(data.outputRowMeta, outputRow);
    return true;
}
需要在上述 Kettle 控件自带的代码中插入下面这段地址清洗代码:
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Splits a Chinese postal address into administrative components with a regular expression.
 */
public class regex_address {

    /**
     * Address grammar, compiled once. Group 1 wraps the named {@code province} group and the four
     * municipality literals, so a municipality match leaves {@code province} unset.
     */
    private static final Pattern ADDRESS_PATTERN = Pattern.compile(
            "((?<province>[^省]+省|.+自治区|.*?行政区)|上海市|北京市|天津市|重庆市)"
            + "(?<city>[^市]+自治州|.*?地区|.*?行政单位|.+盟|市辖区|.*?市)?"
            + "(?<country>[^县]+县|.+市|.+区|.+旗|.+海域|.+岛|)?"
            + "(?<town>[^区]+区|.+镇|.+街|.+街道|.+路)?"
            + "(?<village>[^村]+村|.+大道)?"
            + "(?<other>.*)");

    /** The four municipalities that stand in for a province. */
    private static final Pattern MUNICIPALITY_PATTERN = Pattern.compile("(上海市|北京市|天津市|重庆市)");

    /**
     * Parses an address into its components.
     *
     * @param address the raw address; {@code null} is treated as "nothing to parse"
     * @return one map per regex match, each holding the keys {@code province}, {@code city},
     *         {@code country}, {@code town}, {@code village} and {@code other} in that order;
     *         values are trimmed and a group that did not match is stored as {@code ""}.
     *         The list is empty when the address is {@code null} or nothing matches.
     */
    public static List<Map<String,String>> regexAddress(String address){
        List<Map<String, String>> table = new ArrayList<Map<String, String>>();
        if (address == null) {
            return table;
        }
        Matcher matcher = ADDRESS_PATTERN.matcher(address);
        while (matcher.find()) {
            Map<String, String> row = new LinkedHashMap<String, String>();
            // Look for a municipality only inside the matched province part (group 1). Searching
            // the whole address would let a municipality name that appears later in the text
            // (e.g. in a street or market name) replace the real province.
            Matcher municipality = MUNICIPALITY_PATTERN.matcher(matcher.group(1));
            String province = municipality.find()
                    ? municipality.group(1)
                    : matcher.group("province");
            row.put("province", trimOrEmpty(province));
            row.put("city", trimOrEmpty(matcher.group("city")));
            row.put("country", trimOrEmpty(matcher.group("country")));
            row.put("town", trimOrEmpty(matcher.group("town")));
            row.put("village", trimOrEmpty(matcher.group("village")));
            row.put("other", trimOrEmpty(matcher.group("other")));
            table.add(row);
        }
        return table;
    }

    /** Returns the trimmed value, or {@code ""} when the group did not participate in the match. */
    private static String trimOrEmpty(String value) {
        return value == null ? "" : value.trim();
    }
}
// The Kettle lookup get(Fields.In, "地址").getString(r) only makes sense inside processRow(),
// where the current row r exists, so it cannot feed this standalone driver. The addresses to
// parse are taken from the command-line arguments instead.
/**
 * Stand-alone smoke test: parses every command-line argument as an address and prints the
 * components of its first match.
 *
 * NOTE(review): this method calls regexAddress(...) unqualified, so it must be placed in the
 * same class that declares regexAddress.
 *
 * @param args addresses to parse, one per argument
 */
public static void main(String[] args) {
    for (String address : args) {
        System.out.println(address);
        List<Map<String, String>> parsed = regexAddress(address);
        if (parsed.isEmpty()) {
            // Nothing matched: get(0) would throw IndexOutOfBoundsException.
            System.out.println("  (no match)");
            continue;
        }
        Map<String, String> parts = parsed.get(0);
        System.out.println("  province=" + parts.get("province")
                + ", city=" + parts.get("city")
                + ", country=" + parts.get("country")
                + ", town=" + parts.get("town")
                + ", village=" + parts.get("village")
                + ", other=" + parts.get("other"));
    }
}
我对 Java 不太熟悉,插入后总是出现各种报错,求各位开发者提供正确的代码。Kettle 的输出语句为:get(Fields.Out, "output_fieldname").setValue(r, foobar);
好人一生平安
试试这个
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Processes a single row: reads the "地址" input field, splits it with regexAddress() and writes
 * the components to the output fields province, city, country, town, village and other.
 *
 * If the address yields several regex matches, each match overwrites the previous one in the
 * same output row, so the last match wins. If it is null or yields no match, the six output
 * fields are left as created by createOutputRow().
 *
 * @param smi step metadata passed in by the caller (unused here)
 * @param sdi step data passed in by the caller (unused here)
 * @return {@code false} after {@code setOutputDone()} when {@code getRow()} yields no row,
 *         {@code true} when a row was forwarded
 * @throws KettleException as declared; presumably raised by the row helpers - confirm
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    Object[] r = getRow();
    if (r == null) {
        setOutputDone();
        return false;
    }
    if (first) {
        first = false;
    }

    String address = get(Fields.In, "地址").getString(r);

    // A null field value must not reach the regex helper, which would fail on it.
    List<Map<String, String>> addressList = new ArrayList<Map<String, String>>();
    if (address != null) {
        addressList = regexAddress(address);
    }

    // Size the row by the number of OUTPUT FIELDS, not by the number of regex matches: the
    // array has to be wide enough for every field this step adds.
    Object[] outPutRows = createOutputRow(r, data.outputRowMeta.size());

    // Indexed loop with explicit casts: the original already cast map.get(...) to String,
    // presumably because the step's embedded compiler ignores generics - TODO confirm.
    for (int i = 0; i < addressList.size(); i++) {
        Map<String, String> parts = (Map<String, String>) addressList.get(i);
        get(Fields.Out, "province").setValue(outPutRows, (String) parts.get("province"));
        get(Fields.Out, "city").setValue(outPutRows, (String) parts.get("city"));
        get(Fields.Out, "country").setValue(outPutRows, (String) parts.get("country"));
        get(Fields.Out, "town").setValue(outPutRows, (String) parts.get("town"));
        get(Fields.Out, "village").setValue(outPutRows, (String) parts.get("village"));
        get(Fields.Out, "other").setValue(outPutRows, (String) parts.get("other"));
    }

    // Send the row on to the next step.
    putRow(data.outputRowMeta, outPutRows);
    return true;
}
/**
 * Address grammar, compiled once instead of on every call. Group 1 wraps the named
 * {@code province} group and the four municipality literals, so a municipality match leaves
 * {@code province} unset.
 */
private static final Pattern ADDRESS_PATTERN = Pattern.compile(
        "((?<province>[^省]+省|.+自治区|.*?行政区)|上海市|北京市|天津市|重庆市)"
        + "(?<city>[^市]+自治州|.*?地区|.*?行政单位|.+盟|市辖区|.*?市)?"
        + "(?<country>[^县]+县|.+市|.+区|.+旗|.+海域|.+岛|)?"
        + "(?<town>[^区]+区|.+镇|.+街|.+街道|.+路)?"
        + "(?<village>[^村]+村|.+大道)?"
        + "(?<other>.*)");

/** The four municipalities that stand in for a province. */
private static final Pattern MUNICIPALITY_PATTERN = Pattern.compile("(上海市|北京市|天津市|重庆市)");

/**
 * Parses an address into province, city, county-level and lower components.
 *
 * @param address the raw address; {@code null} is treated as "nothing to parse"
 * @return one map per regex match, each holding the keys {@code province}, {@code city},
 *         {@code country}, {@code town}, {@code village} and {@code other} in that order;
 *         values are trimmed and a group that did not match is stored as {@code ""}.
 *         The list is empty when the address is {@code null} or nothing matches.
 */
public static List<Map<String, String>> regexAddress(String address) {
    List<Map<String, String>> table = new ArrayList<Map<String, String>>();
    if (address == null) {
        return table;
    }
    Matcher matcher = ADDRESS_PATTERN.matcher(address);
    while (matcher.find()) {
        Map<String, String> row = new LinkedHashMap<String, String>();
        // Look for a municipality only inside the matched province part (group 1). Searching the
        // whole address would let a municipality name that appears later in the text (e.g. in a
        // street or market name) replace the real province.
        Matcher municipality = MUNICIPALITY_PATTERN.matcher(matcher.group(1));
        String province;
        if (municipality.find()) {
            province = municipality.group(1);
        } else {
            province = matcher.group("province");
        }
        row.put("province", trimOrEmpty(province));
        row.put("city", trimOrEmpty(matcher.group("city")));
        row.put("country", trimOrEmpty(matcher.group("country")));
        row.put("town", trimOrEmpty(matcher.group("town")));
        row.put("village", trimOrEmpty(matcher.group("village")));
        row.put("other", trimOrEmpty(matcher.group("other")));
        table.add(row);
    }
    return table;
}

/** Returns the trimmed value, or {@code ""} when the group did not participate in the match. */
private static String trimOrEmpty(String value) {
    return value == null ? "" : value.trim();
}