4 changes: 4 additions & 0 deletions develop/dev.env
@@ -22,3 +22,7 @@ WEBPACK_HOST=webpack
WEB_API_PASSWORD=overleaf
WEB_API_USER=overleaf
WEB_HOST=web
ADMIN_PRIVILEGE_AVAILABLE=true
V1_HISTORY_URL=http://history-v1:3100/api
OT_JWT_AUTH_KEY="very secret key"
OVERLEAF_PROXY_LEARN=true
2 changes: 2 additions & 0 deletions server-ce/config/settings.js
@@ -174,6 +174,8 @@ const settings = {
clsiCacheDir: Path.join(DATA_DIR, 'cache'),
// Where to write the output files to disk after running LaTeX
outputDir: Path.join(DATA_DIR, 'output'),
// Where to cache learn pages
learnPagesFolder: Path.join(DATA_DIR, 'learnPages'),
},

// Server Config
3 changes: 3 additions & 0 deletions server-ce/init_scripts/100_make_overleaf_data_dirs.sh
@@ -19,6 +19,9 @@ chown www-data:www-data /var/lib/overleaf/data/template_files
mkdir -p /var/lib/overleaf/data/history
chown www-data:www-data /var/lib/overleaf/data/history

mkdir -p /var/lib/overleaf/data/learnPages
chown www-data:www-data /var/lib/overleaf/data/learnPages

mkdir -p /var/lib/overleaf/tmp/projectHistories
chown www-data:www-data /var/lib/overleaf/tmp/projectHistories

2 changes: 2 additions & 0 deletions services/web/config/settings.defaults.js
@@ -744,6 +744,7 @@ module.exports = {
// them to disk here).
dumpFolder: Path.resolve(__dirname, '../data/dumpFolder'),
uploadFolder: Path.resolve(__dirname, '../data/uploads'),
learnPagesFolder: Path.resolve(__dirname, '../data/learnPages'),
},

// Automatic Snapshots
@@ -1062,6 +1063,7 @@ module.exports = {
moduleImportSequence: [
'history-v1',
'launchpad',
'learn',
'server-ce-scripts',
'user-activate',
'sandboxed-compiles',
110 changes: 110 additions & 0 deletions services/web/modules/learn/app/src/LearnProxy.mjs
@@ -0,0 +1,110 @@
import sanitizeHtml from 'sanitize-html'
import Settings from '@overleaf/settings'
import { sanitizeOptions } from './sanitizeOptions.mjs'
import fs from 'node:fs'
import logger from '@overleaf/logger'
import Path from 'node:path'
import { expressify } from '@overleaf/promise-utils'
import scrape from './scrape.mjs'
const { scrapeAndCachePage } = scrape

// Check whether the cached file for pageName is older than Settings.apis.wiki.maxCacheAge.
// If it is missing or stale, delete it and re-fetch the page into the cache.
async function checkFileCache(learnPagesFolder, pageName) {
const path = Path.join(learnPagesFolder, encodeURIComponent(pageName) + '.json')
// Check if file exists
let stat = null
let now = Date.now()
let mtime = 0
try {
stat = await fs.promises.stat(path)
mtime = stat.mtime.getTime()
} catch (e) {
logger.error({ err: e }, `error stating cached page file: ${path}`)
}

// If the file is missing or stale (older than maxCacheAge), refresh it
if (stat === null || now - mtime > Settings.apis.wiki.maxCacheAge) {
logger.debug({
now: now,
mtime: mtime,
maxCacheAge: Settings.apis.wiki.maxCacheAge
}, `out of date cache detected for file: ${path}`)

const BASE_URL = Settings.apis.wiki.url

try {
await fs.promises.unlink(path)
logger.debug({}, `deleted cached page file to force re-fetching: ${path}`)
} catch (e) {
logger.error({ err: e }, `error deleting cached page file: ${path}`)
}
await scrapeAndCachePage(BASE_URL, pageName)
}

}

async function learnPage(req, res) {
let reqPath = req.path
// Strip the leading '/' so only the wiki page path remains
if (reqPath.startsWith('/')) {
reqPath = reqPath.slice(1)
} else {
res.status(400).send('Bad Request')
return
}
let learnPath = reqPath

if (learnPath === '') {
logger.debug({}, 'Learn proxy requested root path, serving Main Page')
learnPath = 'Main Page'
}

// Encode the path for file system usage
learnPath = encodeURIComponent(decodeURIComponent(learnPath.replace(/_/g, ' ')))
logger.debug({}, `Learn proxy requested path: ${learnPath}`)

// Contents.json holds the sidebar HTML
const contentsFilePath = Path.resolve(Settings.path.learnPagesFolder, 'Contents.json')

// Fetch Contents.json if it is missing or stale; if the fetch fails, the
// readFile below throws and the request ends with a 500
await checkFileCache(Settings.path.learnPagesFolder, 'Contents')
const raw = await fs.promises.readFile(contentsFilePath, 'utf-8')
const json = JSON.parse(raw)
const sidebarHtml = json.text['*']

let pageFilePath = Path.resolve(Settings.path.learnPagesFolder, `${learnPath}.json`)
// If the page does not exist, fall back to "Learn LaTeX in 30 minutes"
if (!fs.existsSync(pageFilePath)) {
learnPath = 'Learn%20LaTeX%20in%2030%20minutes'
pageFilePath = Path.resolve(Settings.path.learnPagesFolder, `${learnPath}.json`)
}

await checkFileCache(Settings.path.learnPagesFolder, decodeURIComponent(learnPath))
const pageRaw = await fs.promises.readFile(pageFilePath, 'utf-8')
const pageJson = JSON.parse(pageRaw)
const pageTitle = pageJson.title
const pageHtml = pageJson.text['*']

res.render(Path.resolve(import.meta.dirname, '../views/learn'), {
sidebarHtml: sanitizeHtml(sidebarHtml, sanitizeOptions),
pageTitle: pageTitle,
pageHtml: sanitizeHtml(pageHtml, sanitizeOptions),
})
}

const LearnProxyController = {
learnPage: expressify(learnPage),
}

export default LearnProxyController
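
The controller above only reads json.title and json.text['*'] from the cached files, which matches the shape of a MediaWiki action=parse response. A minimal sketch of a cached file such as data/learnPages/Learn%20LaTeX%20in%2030%20minutes.json, assuming scrape.mjs stores the parse object returned by the wiki API (the values below are illustrative, not part of this diff):

{
  "title": "Learn LaTeX in 30 minutes",
  "text": {
    "*": "<div class=\"mw-parser-output\">…rendered wiki HTML…</div>"
  }
}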
16 changes: 16 additions & 0 deletions services/web/modules/learn/app/src/LearnRouter.mjs
@@ -0,0 +1,16 @@
import logger from '@overleaf/logger'
import Settings from '@overleaf/settings'
import LearnProxyController from './LearnProxy.mjs'

export default {
apply(webRouter) {
if (!Settings.proxyLearn) {
logger.debug({}, 'Learn proxy disabled via Settings.proxyLearn')
return
}

webRouter.get('/learn', LearnProxyController.learnPage)
webRouter.use('/learn/latex', LearnProxyController.learnPage)
},
}
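
The router is gated on Settings.proxyLearn, and LearnProxy.mjs reads Settings.apis.wiki.url and Settings.apis.wiki.maxCacheAge; none of those settings are defined in this diff. Given the new OVERLEAF_PROXY_LEARN=true entry in develop/dev.env, the wiring presumably looks something like the sketch below (the env-driven defaults and values are assumptions, not part of this PR):

// Hypothetical settings wiring (sketch only, not part of this PR)
proxyLearn: process.env.OVERLEAF_PROXY_LEARN === 'true',
apis: {
  wiki: {
    url: process.env.WIKI_URL || 'https://learn.overleaf.com', // scrape source, assumed
    maxCacheAge: 24 * 60 * 60 * 1000, // refresh cached pages after a day, assumed default
  },
},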
118 changes: 118 additions & 0 deletions services/web/modules/learn/app/src/checkSanitizeOptions.mjs
@@ -0,0 +1,118 @@
import crypto from 'node:crypto'
import fs from 'node:fs'
import Path from 'node:path'
import cheerio from 'cheerio'
// checkSanitizeOptions is only used in dev env
// eslint-disable-next-line import/no-extraneous-dependencies
import * as prettier from 'prettier'
import sanitizeHtml from 'sanitize-html'
import { sanitizeOptions } from './sanitizeOptions.mjs'
import { fileURLToPath } from 'node:url'

const __dirname = Path.dirname(fileURLToPath(import.meta.url))
const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
const DUMP_CSS_IN = Path.join(
Path.dirname(Path.dirname(Path.dirname(__dirname))),
'data',
'dumpFolder'
)

function hash(blob) {
return crypto.createHash('sha1').update(blob).digest('hex')
}

function normalize(blob, title) {
// styles are dropped in web and kept in wiki pages for previewing there.
blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
if (EXTRACT_STYLE) {
// normalize css with prettier
const css = prettier.format(match, { parser: 'css' })
fs.writeFileSync(
Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
`/* title: ${title} */\n\n${css}`
)
}
if (OMIT_STYLE) {
return ''
}
return match
})

// strip comments:
// - comment at the bottom of each page
blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
// - annotation of math characters
blob = blob.replace(/<!-- . -->/g, '')

// wrap for consistent rendering
if (blob.indexOf('<html><head>') !== 0) {
blob = `<html><head>${blob}</head></html>`
}

// normalize inline style:
// - drop trailing ;
blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
// - normalize whitespace
blob = blob.replace(
/style="([^"]+)"/g,
(_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
)

// let cheerio do another pass
return cheerio.load(blob).html()
}

function toText(blob) {
return cheerio.load(blob).text()
}

const zoomOut = 50
function peak(content, offset) {
// show some more content before/after the mismatch
if (offset > zoomOut) {
offset -= zoomOut
}
// wrap in JSON to escape new line characters
return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}

const chunkSize = 100
function findFirstMismatch(a, b) {
if (a === b) return a.length
let i = 0
while (
a.length > chunkSize &&
b.length > chunkSize &&
a.slice(0, chunkSize) === b.slice(0, chunkSize)
) {
i++
a = a.slice(chunkSize)
b = b.slice(chunkSize)
}
return i * chunkSize
}

function checkSanitizeOptions(page, title, text) {
text = normalize(text, title)
const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
if (text === sanitized) return

const offset = findFirstMismatch(text, sanitized)

const textToText = toText(text)
const sanitizedToText = toText(sanitized)
const offsetText = findFirstMismatch(textToText, sanitizedToText)

console.error('---')
console.error('page :', page)
console.error('title :', title)
console.error('match :', text === sanitized)
console.error('toText :', toText(text) === toText(sanitized))
console.error('text :', peak(text, offset))
console.error('sanitized :', peak(sanitized, offset))
console.error('textToText :', peak(textToText, offsetText))
console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}

export default checkSanitizeOptions
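
checkSanitizeOptions is a development-time guard: it normalizes a page's HTML, sanitizes it with the same sanitizeOptions used by the proxy, and prints a report to stderr when sanitization changed anything. A hedged usage sketch, assuming it is invoked against a cached page file (the page name and path are illustrative):

import fs from 'node:fs'
import checkSanitizeOptions from './checkSanitizeOptions.mjs'

// Illustrative only: feed a cached page through the sanitizer check
const page = 'Learn LaTeX in 30 minutes'
const cached = JSON.parse(
  fs.readFileSync(`data/learnPages/${encodeURIComponent(page)}.json`, 'utf-8')
)
// Prints nothing when sanitizeHtml leaves the normalized page unchanged,
// otherwise logs the page, title, and first mismatching chunk to stderr
checkSanitizeOptions(page, cached.title, cached.text['*'])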