freeCodeCamp/search-indexing/utils/index.js

106 lines
2.7 KiB
JavaScript

const { Observable } = require('rxjs');
const fse = require('fs-extra');
const file = require('file');
const _ = require('lodash');
const { isURL } = require('validator');
const stripTags = require('striptags');
const Entities = require('html-entities').AllHtmlEntities;
const entities = new Entities();
// Name patterns shared by the directory helpers in this module.

// Names ending in .md, .js, .jsx, .htm or .html.
const isAFileRE = /(\.md|\.jsx?|\.html?)$/;
// Names ending in .js or .jsx.
const isJSRE = /\.jsx?$/;
// Names that begin with an underscore or a dot.
const shouldBeIgnoredRE = /^(\_|\.)/;
// Entry names that readDir always removes from its listing.
const excludedDirs = ['search'];
// Paths ending in "guides/svn"; listDirectory drops these from its result.
const guideSvnRE = /guides\/svn$/;

// guideSvnRE is intentionally not part of the public surface.
Object.assign(exports, {
  isAFileRE,
  isJSRE,
  shouldBeIgnoredRE,
  excludedDirs
});
/*
* *
* Directory Helpers *
* *
*/
exports.listDirectory = function listDirectory(start) {
let allDirs = [];
file.walkSync(start, dirPath => {
if (dirPath.includes('.svn')) {
return;
}
allDirs = [...allDirs, dirPath];
});
return allDirs.filter(name => !guideSvnRE.test(name));
};
/**
 * List the entries of `dir`, minus everything this module never indexes.
 *
 * An entry is dropped when it is named in `excludedDirs`, starts with "_"
 * or ".", ends in .js/.jsx, or is exactly "LICENSE.md".
 *
 * @param {string} [dir=__dirname] - Directory to list.
 * @param {boolean} [returnFiles=false] - When false, entries whose name
 *   matches `isAFileRE` are removed as well.
 * @returns {string[]} The surviving entry names.
 */
function readDir(dir = __dirname, returnFiles = false) {
  const isWanted = name =>
    !excludedDirs.includes(name) &&
    !shouldBeIgnoredRE.test(name) &&
    !isJSRE.test(name) &&
    name !== 'LICENSE.md';

  const entries = fse.readdirSync(dir).filter(isWanted);
  if (returnFiles) {
    return entries;
  }
  return entries.filter(name => !isAFileRE.test(name));
}
exports.readDir = readDir;
exports.parseDirectory = function parseDirectory(dirLevel, cb) {
return Observable.from(readDir(dirLevel)).flatMap(dir => {
const dirPath = `${dirLevel}/${dir}`;
const subDirs = readDir(dirPath);
if (!subDirs) {
cb(dirPath);
return Observable.of(null);
}
cb(dirPath);
return parseDirectory(dirPath, cb);
});
};
/*
* *
* Document Helpers *
* *
*/
exports.chunkDocument = function chunkDocument(doc, pickFields, chunkField) {
const baseDoc = _.pick(doc, pickFields);
const chunks = doc[chunkField].match(/(?:[\n\s]+[\S]+){1,200}/g);
if (!chunks) {
return [doc];
}
return chunks.map(chunk => ({ ...baseDoc, [chunkField]: chunk }));
};
/**
 * Remove URL tokens from a piece of text.
 *
 * The text is split on single whitespace characters; empty tokens and tokens
 * that `isURL` accepts are discarded, and the rest are joined with one space.
 *
 * @param {string} str - Text to clean.
 * @returns {string} The remaining tokens separated by single spaces.
 */
function stripURLs(str) {
  const kept = [];
  for (const token of str.split(/\s/)) {
    if (_.isEmpty(token) || isURL(token)) {
      continue;
    }
    kept.push(token);
  }
  return kept.join(' ');
}
/**
 * Terminate decimal character references that are missing their semicolon,
 * e.g. "it&#39s" becomes "it&#39;s".
 *
 * References that already end in ";" are left untouched, whatever their
 * number of digits. Hexadecimal references ("&#x27") are not handled.
 *
 * @param {string} str - Text that may contain unterminated references.
 * @returns {string} The text with a ";" inserted after each unterminated
 *   decimal reference.
 */
function fixEntities(str) {
  // Consume the whole digit run and only act when it is not already followed
  // by ";". Matching a fixed two digits used to split longer references such
  // as "&#160;" into "&#16;0;".
  return str.replace(/&#(\d+)(?![\d;])/g, '&#$1;');
}
exports.stripURLs = stripURLs;
exports.stripHTML = function stripHTML(text) {
const unescapedStr = entities.decode(fixEntities(text));
return stripTags(unescapedStr);
};