From d6d44df3321a27be9c4e317066476f3390085aec Mon Sep 17 00:00:00 2001
From: m bobak <mbobak@illinois.edu>
Date: Fri, 14 Oct 2022 07:49:24 -0700
Subject: [PATCH 1/5] vs using sitemap.xml fork branch

---
 app/controllers/Application.scala | 46 +++++++++++++++++++++++++++++++
 conf/routes                       |  6 ++++
 conf/sitemap.xml                  |  4 +++
 3 files changed, 56 insertions(+)
 create mode 100644 conf/sitemap.xml
diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala
index 344474f35..c3dab2d6e 100644
--- a/app/controllers/Application.scala
+++ b/app/controllers/Application.scala
@@ -4,6 +4,7 @@ import models.{Event, UUID, UserStatus}
 import play.api.Play.current
 import play.api.mvc.Action
 import play.api.{Logger, Play, Routes}
+import play.api.libs.json._
 import services._
 import util.Formatters.sanitizeHTML
 
@@ -84,6 +85,51 @@ class Application @Inject()(files: FileService, collections: CollectionService,
     }
   }
 
+
+  /**
+   * Returns the sitemap.xml for the datasets to be scraped for their jsonld scripts
+   * suggested to start like w/swagger route, but if I don't cache it, then I should change this
+   *  otherwise it will need a filler file there; which I should provide as a cache
+   */
+  def sitemap = Action { implicit request =>
+    Play.resource("/public/sitemap.xml") match { //in case we cache it here someday
+      case Some(resource) => {
+        val https = Utils.https(request)
+        val clowderurl = new URL(Utils.baseUrl(request))
+        val host = if (clowderurl.getPort == -1) {
+          clowderurl.getHost
+        } else {
+          clowderurl.getHost + ":" + clowderurl.getPort
+        }
+        var resultStr=""
+        val top= """<?xml version="1.0" encoding="UTF-8"?>
+            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> """
+        resultStr = resultStr.concat(top)
+        //though had called the route2get but couldn't change datastruct
+        val d = scala.io.Source.fromURL(clowderurl + "/api/datasets")
+        val sd = d.mkString
+        val parsedJson = Json.parse(sd)
+        val idl = (parsedJson \\ "id")
+        val id1=idl(1)
+        var uStr = ""
+        idl.foreach( id => {
+           val id_ = id.as[String]
+           uStr = "\n<url><loc>" + clowderurl + "/datasets/" + id_ + "</loc></url>"
+           resultStr = resultStr.concat(uStr)
+        })
+        resultStr = resultStr +  "\n</urlset>"
+        //could cache, in case we want to reuse later, w/Ok(reult.mkString)
+        //_would again check cache before creating, but still problems w/:
+        //BufferedWriter writer = new BufferedWriter(new FileWriter(resource));
+        //writer.write(resultStr); writer.close(); //getting errors again w/this
+        //val resultStr = "ret string vs file"
+        Ok(resultStr.mkString)
+      }
+      case None => NotFound("Could not find sitemap.xml")
+    }
+  }
+
+
   /**
    * Main page.
    */
diff --git a/conf/routes b/conf/routes
index 0eb5f62b6..4cd844301 100644
--- a/conf/routes
+++ b/conf/routes
@@ -297,6 +297,12 @@ GET            /javascriptRoutes
 # ----------------------------------------------------------------------
 GET            /swagger                                                                 @controllers.Application.swagger
 GET            /swaggerUI                                                               @controllers.Application.swaggerUI
+ 
+# ----------------------------------------------------------------------
+# SITEMAP
+# ----------------------------------------------------------------------
+GET            /sitemap.xml                                                             @controllers.Application.sitemap
+GET            /sitemap                                                                 @controllers.Application.sitemap
 
 # ----------------------------------------------------------------------
 # RESTful API
diff --git a/conf/sitemap.xml b/conf/sitemap.xml
new file mode 100644
index 000000000..a9a734872
--- /dev/null
+++ b/conf/sitemap.xml
@@ -0,0 +1,4 @@
+Placeholder for now:
+The sitemap route is set up to read from this cached file, so it expects the file to exist
+ even though the caching has not been implemented yet,
+ and right now the route returns the generated sitemap directly

From 44a57e106e5480890aa08bbc47190eff93cc8269 Mon Sep 17 00:00:00 2001
From: m bobak <mbobak@illinois.edu>
Date: Fri, 14 Oct 2022 07:51:49 -0700
Subject: [PATCH 2/5] initial sitemap route

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 85bd19a86..0507df0b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ script prior to upgrading to minimize the downtime.
 - In the docker container the folder /home/clowder/data is now whitelisted by default for uploading by reference. 
   This can be changed using the environment variable CLOWDER_SOURCEPATH.
 - The current CLA for developers of clowder.
+- sitemap.xml route to list dataset pages so they can be crawled for their embedded JSON-LD, for Google Dataset Search
 
 ### Fixed
 - Send email to all admins in a single email when a user submits 'Request access' for a space

From 735b0b1b8ea5189dde677c3dab9005a7ecb2a49d Mon Sep 17 00:00:00 2001
From: mike bobak <MBcode@users.noreply.github.com>
Date: Fri, 21 Oct 2022 10:50:30 -0500
Subject: [PATCH 3/5] Update Application.scala

Use getDatasets for User.anonymous for now, instead of calling the /api/datasets route.
Possible caching and changing the list limit can be looked at in the next iteration.
---
 app/controllers/Application.scala | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala
index c3dab2d6e..fe20f34b3 100644
--- a/app/controllers/Application.scala
+++ b/app/controllers/Application.scala
@@ -1,13 +1,13 @@
 package controllers
 
-import models.{Event, UUID, UserStatus}
+import models.{Event, UUID, UserStatus, User}
 import play.api.Play.current
 import play.api.mvc.Action
 import play.api.{Logger, Play, Routes}
 import play.api.libs.json._
 import services._
 import util.Formatters.sanitizeHTML
-
+import api.Permission.Permission 
 import java.net.URL
 import javax.inject.{Inject, Singleton}
 import scala.collection.immutable.List
@@ -89,7 +89,7 @@ class Application @Inject()(files: FileService, collections: CollectionService,
   /**
    * Returns the sitemap.xml for the datasets to be scraped for their jsonld scripts
    * suggested to start like w/swagger route, but if I don't cache it, then I should change this
-   *  otherwise it will need a filler file there; which I should provide as a cache
+   *  otherwise it will need a filler file there; which I  provide till used as a cache
    */
   def sitemap = Action { implicit request =>
     Play.resource("/public/sitemap.xml") match { //in case we cache it here someday
@@ -101,28 +101,20 @@ class Application @Inject()(files: FileService, collections: CollectionService,
         } else {
           clowderurl.getHost + ":" + clowderurl.getPort
         }
+        val user = User.anonymous 
+        val dd = tree.getDatasets(false,user)
         var resultStr=""
         val top= """<?xml version="1.0" encoding="UTF-8"?>
             <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> """
         resultStr = resultStr.concat(top)
-        //though had called the route2get but couldn't change datastruct
-        val d = scala.io.Source.fromURL(clowderurl + "/api/datasets")
-        val sd = d.mkString
-        val parsedJson = Json.parse(sd)
-        val idl = (parsedJson \\ "id")
-        val id1=idl(1)
         var uStr = ""
-        idl.foreach( id => {
-           val id_ = id.as[String]
-           uStr = "\n<url><loc>" + clowderurl + "/datasets/" + id_ + "</loc></url>"
+        dd.foreach( dd_ => {
+           val dd_id = (dd_ \ "id").as[String]
+           uStr = "\n<url><loc>" + clowderurl + "/datasets/" + dd_id + "</loc></url>"
            resultStr = resultStr.concat(uStr)
         })
         resultStr = resultStr +  "\n</urlset>"
-        //could cache, in case we want to reuse later, w/Ok(reult.mkString)
-        //_would again check cache before creating, but still problems w/:
-        //BufferedWriter writer = new BufferedWriter(new FileWriter(resource));
-        //writer.write(resultStr); writer.close(); //getting errors again w/this
-        //val resultStr = "ret string vs file"
+        //could still cache and read when nothing new but less likely if have to recheck permissions as well
         Ok(resultStr.mkString)
       }
       case None => NotFound("Could not find sitemap.xml")

From bd9e344274f8346e7882cb0ba496cdaf7847c3e5 Mon Sep 17 00:00:00 2001
From: mike bobak <MBcode@users.noreply.github.com>
Date: Thu, 27 Oct 2022 16:35:01 -0500
Subject: [PATCH 4/5] Update Application.scala

from remote sitemap.xml branch
---
 app/controllers/Application.scala | 34 +++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala
index fe20f34b3..b61c0cb57 100644
--- a/app/controllers/Application.scala
+++ b/app/controllers/Application.scala
@@ -1,13 +1,12 @@
 package controllers
 
-import models.{Event, UUID, UserStatus, User}
+import models.{Event, UUID, UserStatus}
 import play.api.Play.current
 import play.api.mvc.Action
 import play.api.{Logger, Play, Routes}
-import play.api.libs.json._
 import services._
 import util.Formatters.sanitizeHTML
-import api.Permission.Permission 
+
 import java.net.URL
 import javax.inject.{Inject, Singleton}
 import scala.collection.immutable.List
@@ -19,7 +18,8 @@ import scala.collection.mutable.ListBuffer
 @Singleton
 class Application @Inject()(files: FileService, collections: CollectionService, datasets: DatasetService,
                             spaces: SpaceService, events: EventService, comments: CommentService,
-                            sections: SectionService, users: UserService, selections: SelectionService) extends SecuredController {
+                            sections: SectionService, users: UserService, selections: SelectionService,
+                            tree: TreeService) extends SecuredController {
   /**
    * Redirect any url's that have a trailing /
    *
@@ -85,12 +85,15 @@ class Application @Inject()(files: FileService, collections: CollectionService,
     }
   }
 
-
   /**
    * Returns the sitemap.xml for the datasets to be scraped for their jsonld scripts
    * suggested to start like w/swagger route, but if I don't cache it, then I should change this
-   *  otherwise it will need a filler file there; which I  provide till used as a cache
+   *  otherwise it will need a filler file there; which I should provide as a cache
    */
+import play.api.libs.json._  //put at top
+import api.Permission.Permission //put at top
+import models.User
+
   def sitemap = Action { implicit request =>
     Play.resource("/public/sitemap.xml") match { //in case we cache it here someday
       case Some(resource) => {
@@ -101,7 +104,8 @@ class Application @Inject()(files: FileService, collections: CollectionService,
         } else {
           clowderurl.getHost + ":" + clowderurl.getPort
         }
-        val user = User.anonymous 
+        val user = User.anonymous //not found: value User
+        //val dd=tree.getDatasets(true,user) //not owned by anon
         val dd = tree.getDatasets(false,user)
         var resultStr=""
         val top= """<?xml version="1.0" encoding="UTF-8"?>
@@ -113,15 +117,27 @@ class Application @Inject()(files: FileService, collections: CollectionService,
            uStr = "\n<url><loc>" + clowderurl + "/datasets/" + dd_id + "</loc></url>"
            resultStr = resultStr.concat(uStr)
         })
+        //was from route
+        //val d = scala.io.Source.fromURL(clowderurl + "/api/datasets")
+        //val sd = d.mkString
+        //val parsedJson = Json.parse(sd)
+        //val idl = (parsedJson \\ "id")
+        //idl.foreach( id => {
+        //   val id_ = id.as[String]
+        //   uStr = "\n<url><loc>" + clowderurl + "/datasets/" + id_ + "</loc></url>"
+        //   resultStr = resultStr.concat(uStr)
+        //})
+        //will rm above once getstatsets
         resultStr = resultStr +  "\n</urlset>"
-        //could still cache and read when nothing new but less likely if have to recheck permissions as well
+        //could cache, in case we want to reuse later, w/Ok(reult.mkString)
+        //_would again check cache before creating, but still problems w/:
+        //might skip as would have to recheck permissions as well
         Ok(resultStr.mkString)
       }
       case None => NotFound("Could not find sitemap.xml")
     }
   }
 
-
   /**
    * Main page.
    */

From fc1b0e91a915a9eb9b4af77676d96f1752e9ffe9 Mon Sep 17 00:00:00 2001
From: mike bobak <MBcode@users.noreply.github.com>
Date: Thu, 27 Oct 2022 16:35:06 -0500
Subject: [PATCH 5/5] Create sitemap.xml

from remote sitemap.xml branch
---
 public/sitemap.xml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 public/sitemap.xml

diff --git a/public/sitemap.xml b/public/sitemap.xml
new file mode 100644
index 000000000..a57eac995
--- /dev/null
+++ b/public/sitemap.xml
@@ -0,0 +1,6 @@
+filler that will be replaced with cached sitemap
+though that idea might be on hold if we have to
+ worry about who has access to this sitemap
+ as clowder v1 only has public and hidden
+while v2 might get a private setting where
+ you can see that it is there but not download it w/o auth